diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e8bd73a220701..2799a3e78b04d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5038,11 +5038,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64; case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64; case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64; - case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_t16_e64; - case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_t16_e64; - case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_t16_e64; - case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_t16_e64; - case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_t16_e64; + case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64; + case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64; + case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64; + case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64; + case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64; diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index d258e060ba3e1..7d4393b653a75 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -5,6 +5,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: @@ -179,6 +180,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -325,6 +352,29 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: fast_frem_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -471,6 +521,29 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: unsafe_frem_f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -679,6 +752,44 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v5, v4 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX1150-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v1 +; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -817,6 +928,29 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: fast_frem_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -955,6 +1089,29 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: unsafe_frem_f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 +; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 @@ -1176,6 +1333,41 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_f64: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v12, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7] +; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -1370,6 +1562,37 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: fast_frem_f64: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v10, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7] +; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -1564,6 +1787,37 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: unsafe_frem_f64: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v10, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7] +; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 @@ -1832,6 +2086,47 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX1150-NEXT: s_waitcnt vmcnt(1) +; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v3, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f16_e32 v5, v4, v3 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX1150-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v5 +; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8 @@ -2253,6 +2548,71 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v4f16: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] +; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX1150-NEXT: s_waitcnt vmcnt(1) +; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0 +; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2 +; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX1150-NEXT: v_rcp_f32_e32 v2, v2 +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0] +; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f16_e32 v2, v2 +; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3 +; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6 +; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16 @@ -2557,6 +2917,66 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7] +; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 +; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX1150-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-NEXT: v_fma_f32 v1, v5, v3, v1 +; GFX1150-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 +; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v6, v5 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2 +; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8 @@ -3053,6 +3473,110 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v4f32: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v8, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7] +; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 +; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v11, v10 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX1150-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX1150-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v9, v9, v7, v3 +; GFX1150-NEXT: v_trunc_f32_e32 v9, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 +; GFX1150-NEXT: v_fma_f32 v3, v9, v7, v3 +; GFX1150-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 +; GFX1150-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v10, v9 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX1150-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v7, v7, v6, v2 +; GFX1150-NEXT: v_trunc_f32_e32 v7, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 +; GFX1150-NEXT: v_fma_f32 v2, v7, v6, v2 +; GFX1150-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 +; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v9, v7 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX1150-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v5, v1 +; GFX1150-NEXT: v_trunc_f32_e32 v6, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX1150-NEXT: v_fma_f32 v1, v6, v5, v1 +; GFX1150-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 +; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 +; GFX1150-NEXT: s_denorm_mode 15 +; GFX1150-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fmac_f32_e32 v7, v9, v7 +; GFX1150-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX1150-NEXT: s_denorm_mode 12 +; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v4, v0 +; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16 @@ -3384,6 +3908,59 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1150-NEXT: v_mov_b32_e32 v16, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: s_clause 0x1 +; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7] +; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX1150-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX1150-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX1150-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX1150-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] +; GFX1150-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX1150-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] +; GFX1150-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX1150-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX1150-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX1150-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; GFX1150-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] +; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] +; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX1150-NEXT: s_nop 0 +; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1150-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16