diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 3d21f522e97ce..e2b7a5ead2cd3 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -695,7 +695,6 @@ def constant_fold_fabs : constant_fold_unary_fp_op_rule; def constant_fold_fsqrt : constant_fold_unary_fp_op_rule; def constant_fold_flog2 : constant_fold_unary_fp_op_rule; def constant_fold_fptrunc : constant_fold_unary_fp_op_rule; -def constant_fold_fpext : constant_fold_unary_fp_op_rule; // Fold constant zero int to fp conversions. class itof_const_zero_fold_rule : GICombineRule < @@ -714,7 +713,6 @@ def constant_fold_fp_ops : GICombineGroup<[ constant_fold_fsqrt, constant_fold_flog2, constant_fold_fptrunc, - constant_fold_fpext, itof_const_zero_fold_si, itof_const_zero_fold_ui ]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b425b952bfc1d..906d62a33d51d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1728,7 +1728,6 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI, Result.clearSign(); return Result; } - case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: { bool Unused; LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index ecaeff77fcb4b..639ddcba28468 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering // Post-legalization combines which are primarily optimizations. def AArch64PostLegalizerCombiner : GICombiner<"AArch64PostLegalizerCombinerImpl", - [copy_prop, cast_of_cast_combines, constant_fold_fp_ops, + [copy_prop, cast_of_cast_combines, buildvector_of_truncate, integer_of_truncate, mutate_anyext_to_zext, combines_for_extload, combine_indexed_load_store, sext_trunc_sextload, diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll index e8e563135acc5..322a96aca5db2 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -739,12 +739,14 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) { ; ; GISEL-LABEL: postidx32_shalf: ; GISEL: ; %bb.0: -; GISEL-NEXT: ldr h1, [x0], #4 +; GISEL-NEXT: movi d1, #0000000000000000 +; GISEL-NEXT: ldr h2, [x0], #4 ; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0 ; GISEL-NEXT: fmov w9, s0 -; GISEL-NEXT: fcvt s2, h1 -; GISEL-NEXT: fmov w8, s1 -; GISEL-NEXT: fcmp s2, #0.0 +; GISEL-NEXT: fcvt s3, h2 +; GISEL-NEXT: fmov w8, s2 +; GISEL-NEXT: fcvt s1, h1 +; GISEL-NEXT: fcmp s3, s1 ; GISEL-NEXT: csel w8, w8, w9, mi ; GISEL-NEXT: strh w8, [x1] ; GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll index 085170c7ba381..b234ef7a5ff8b 100644 --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -782,16 +782,18 @@ define void @test_fccmp(half %in, ptr %out) { ; ; CHECK-CVT-GI-LABEL: test_fccmp: ; CHECK-CVT-GI: // %bb.0: -; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-GI-NEXT: fcvt s1, h0 -; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000 ; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0 -; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000 -; CHECK-CVT-GI-NEXT: fcmp s1, s2 -; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0] +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-GI-NEXT: fcvt s2, h0 +; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] +; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_1 +; CHECK-CVT-GI-NEXT: ldr h4, [x8, :lo12:.LCPI29_1] ; CHECK-CVT-GI-NEXT: fmov w8, s0 -; CHECK-CVT-GI-NEXT: fmov w9, s2 -; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi +; CHECK-CVT-GI-NEXT: fcvt s3, h1 +; CHECK-CVT-GI-NEXT: fmov w9, s1 +; CHECK-CVT-GI-NEXT: fcvt s4, h4 +; CHECK-CVT-GI-NEXT: fcmp s2, s3 +; CHECK-CVT-GI-NEXT: fccmp s2, s4, #4, mi ; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt ; CHECK-CVT-GI-NEXT: strh w8, [x0] ; CHECK-CVT-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll index 743d1604388de..7409bfb91454c 100644 --- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll +++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll @@ -149,21 +149,33 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) { } define i32 @fcvtzs_f16_i32_7(half %flt) { -; CHECK-NO16-LABEL: fcvtzs_f16_i32_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI8_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI8_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0 @@ -177,21 +189,33 @@ define i32 @fcvtzs_f16_i32_7(half %flt) { } define i32 @fcvtzs_f16_i32_15(half %flt) { -; CHECK-NO16-LABEL: fcvtzs_f16_i32_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI9_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI9_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0 @@ -205,21 +229,33 @@ define i32 @fcvtzs_f16_i32_15(half %flt) { } define i64 @fcvtzs_f16_i64_7(half %flt) { -; CHECK-NO16-LABEL: fcvtzs_f16_i64_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI10_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI10_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0 @@ -233,21 +269,33 @@ define i64 @fcvtzs_f16_i64_7(half %flt) { } define i64 @fcvtzs_f16_i64_15(half %flt) { -; CHECK-NO16-LABEL: fcvtzs_f16_i64_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI11_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI11_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0 @@ -405,21 +453,33 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) { } define i32 @fcvtzu_f16_i32_7(half %flt) { -; CHECK-NO16-LABEL: fcvtzu_f16_i32_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI20_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI20_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0 @@ -433,21 +493,33 @@ define i32 @fcvtzu_f16_i32_7(half %flt) { } define i32 @fcvtzu_f16_i32_15(half %flt) { -; CHECK-NO16-LABEL: fcvtzu_f16_i32_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI21_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI21_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0 @@ -461,21 +533,33 @@ define i32 @fcvtzu_f16_i32_15(half %flt) { } define i64 @fcvtzu_f16_i64_7(half %flt) { -; CHECK-NO16-LABEL: fcvtzu_f16_i64_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI22_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI22_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0 @@ -489,21 +573,33 @@ define i64 @fcvtzu_f16_i64_7(half %flt) { } define i64 @fcvtzu_f16_i64_15(half %flt) { -; CHECK-NO16-LABEL: fcvtzu_f16_i64_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI23_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI23_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0 @@ -678,11 +774,13 @@ define half @scvtf_f16_i32_7(i32 %int) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s1, w0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: scvtf s0, w0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI32_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI32_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -716,11 +814,13 @@ define half @scvtf_f16_i32_15(i32 %int) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s1, w0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: scvtf s0, w0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI33_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI33_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -754,11 +854,13 @@ define half @scvtf_f16_i64_7(i64 %long) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s1, x0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: scvtf s0, x0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI34_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI34_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -792,11 +894,13 @@ define half @scvtf_f16_i64_15(i64 %long) { ; ; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: scvtf s1, x0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: scvtf s0, x0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI35_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI35_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -974,11 +1078,13 @@ define half @ucvtf_f16_i32_7(i32 %int) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s1, w0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: ucvtf s0, w0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI44_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI44_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1012,11 +1118,13 @@ define half @ucvtf_f16_i32_15(i32 %int) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s1, w0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: ucvtf s0, w0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI45_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI45_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1050,11 +1158,13 @@ define half @ucvtf_f16_i64_7(i64 %long) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s1, x0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: ucvtf s0, x0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI46_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI46_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1088,11 +1198,13 @@ define half @ucvtf_f16_i64_15(i64 %long) { ; ; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15: ; CHECK-GI-NO16: // %bb.0: -; CHECK-GI-NO16-NEXT: ucvtf s1, x0 -; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24 -; CHECK-GI-NO16-NEXT: fcvt h1, s1 +; CHECK-GI-NO16-NEXT: ucvtf s0, x0 +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI47_0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI47_0] ; CHECK-GI-NO16-NEXT: fcvt s1, h1 -; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1 ; CHECK-GI-NO16-NEXT: fcvt h0, s0 ; CHECK-GI-NO16-NEXT: ret ; @@ -1244,21 +1356,33 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) { } define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { -; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI55_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI55_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0 @@ -1272,21 +1396,33 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) { } define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { -; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI56_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI56_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0 @@ -1300,21 +1436,33 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) { } define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { -; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI57_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI57_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0 @@ -1328,21 +1476,33 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) { } define i64 @fcvtzs_sat_f16_i64_15(half %dbl) { -; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzs x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzs x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI58_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI58_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzs x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0 @@ -1490,21 +1650,33 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) { } define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { -; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI66_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI66_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0 @@ -1518,21 +1690,33 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) { } define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { -; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu w0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu w0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI67_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI67_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu w0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0 @@ -1546,21 +1730,33 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) { } define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { -; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI68_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI68_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0 @@ -1574,21 +1770,33 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) { } define i64 @fcvtzu_sat_f16_i64_15(half %dbl) { -; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15: -; CHECK-NO16: // %bb.0: -; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fmul s0, s0, s1 -; CHECK-NO16-NEXT: fcvt h0, s0 -; CHECK-NO16-NEXT: fcvt s0, h0 -; CHECK-NO16-NEXT: fcvtzu x0, s0 -; CHECK-NO16-NEXT: ret +; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15: +; CHECK-SD-NO16: // %bb.0: +; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NO16-NEXT: fcvt h0, s0 +; CHECK-SD-NO16-NEXT: fcvt s0, h0 +; CHECK-SD-NO16-NEXT: fcvtzu x0, s0 +; CHECK-SD-NO16-NEXT: ret ; ; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15: ; CHECK-SD-FP16: // %bb.0: ; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15 ; CHECK-SD-FP16-NEXT: ret ; +; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15: +; CHECK-GI-NO16: // %bb.0: +; CHECK-GI-NO16-NEXT: adrp x8, .LCPI69_0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI69_0] +; CHECK-GI-NO16-NEXT: fcvt s1, h1 +; CHECK-GI-NO16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NO16-NEXT: fcvt h0, s0 +; CHECK-GI-NO16-NEXT: fcvt s0, h0 +; CHECK-GI-NO16-NEXT: fcvtzu x0, s0 +; CHECK-GI-NO16-NEXT: ret +; ; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0 @@ -1603,3 +1811,4 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) { ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} ; CHECK-FP16: {{.*}} +; CHECK-NO16: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll index e1bc7426ad63e..98276b68481a1 100644 --- a/llvm/test/CodeGen/AArch64/frem-power2.ll +++ b/llvm/test/CodeGen/AArch64/frem-power2.ll @@ -100,8 +100,9 @@ define half @hrem2_nsz(half %x) { ; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: fmov h1, #2.00000000 ; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: fmov s1, #2.00000000 +; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll index 8e0328eaa2658..be07978cd8516 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -38,11 +38,17 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_v2HalfH: ; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] +; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -82,13 +88,19 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_v3HalfH: ; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2] +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2] +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 -; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 ; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0 @@ -140,11 +152,17 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_HalfH: ; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI3_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI3_0] +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 @@ -232,10 +250,16 @@ define half @add_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI4_0 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI4_0] +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1 +; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 @@ -424,10 +448,16 @@ define half @add_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: add_2H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI7_0 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h0 +; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI7_0] +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 +; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 +; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2 +; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3 ; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll index 716401e2ebafe..c10d6e94226f2 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -52,11 +52,17 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_HalfH: ; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0 ; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0] ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 @@ -138,10 +144,16 @@ define half @mul_H(<8 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h0 +; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2 ; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 @@ -309,10 +321,16 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; ; CHECK-GI-NOFP16-LABEL: mul_2H: ; CHECK-GI-NOFP16: // %bb.0: -; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI5_0 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h0 +; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 -; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2 +; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1] +; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 +; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 +; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 +; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3 ; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2] ; CHECK-GI-NOFP16-NEXT: fcvt h2, s2 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 1b879a604d715..1aee6ab24eea0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -403,38 +403,40 @@ define half @v_neg_rcp_f16(half %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -458,38 +460,40 @@ define half @v_rcp_f16(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -513,38 +517,40 @@ define half @v_rcp_f16_arcp(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -569,7 +575,9 @@ define half @v_rcp_f16_arcp_afn(half %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -592,38 +600,40 @@ define half @v_rcp_f16_ulp25(half %x) { ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -1444,67 +1454,70 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1513,27 +1526,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1545,23 +1561,26 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_rcp_v2f16: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1575,27 +1594,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1606,24 +1628,26 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_rcp_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1 -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1636,27 +1660,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -1669,21 +1696,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -1696,25 +1726,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -1725,67 +1757,70 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1794,27 +1829,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1826,23 +1864,26 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1856,27 +1897,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1887,24 +1931,26 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1 -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1917,27 +1963,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -1950,21 +1999,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -1977,25 +2029,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -2010,32 +2064,33 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2046,37 +2101,39 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2086,27 +2143,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2119,23 +2179,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2150,27 +2213,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2182,24 +2248,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2 -; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -2211,29 +2279,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -2245,23 +2316,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -2272,30 +2346,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2312,32 +2386,33 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2348,37 +2423,39 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2388,27 +2465,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2421,23 +2501,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2452,27 +2535,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -2484,24 +2570,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2 -; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -2513,29 +2601,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -2547,23 +2638,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -2574,30 +2668,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_neg_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_mov_b32_e32 v5, -1.0 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v7 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 ; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2610,67 +2704,70 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2717,8 +2814,11 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2764,67 +2864,70 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -2833,27 +2936,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2865,23 +2971,26 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2895,27 +3004,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5 -; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2926,24 +3038,26 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1 -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -2956,27 +3070,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -2989,21 +3106,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -3016,25 +3136,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x @@ -3911,38 +4033,40 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_rcp_f16: ; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog ; ; GFX6-FLUSH-LABEL: s_rcp_f16: ; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 @@ -3975,38 +4099,40 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) { define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_neg_rcp_f16: ; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog ; ; GFX6-FLUSH-LABEL: s_neg_rcp_f16: ; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 @@ -4040,20 +4166,21 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) { ; GFX6-IEEE-LABEL: s_rsq_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-IEEE-NEXT: ; return to shader part epilog @@ -4061,23 +4188,24 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) { ; GFX6-FLUSH-LABEL: s_rsq_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog @@ -4113,35 +4241,36 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -4154,40 +4283,42 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0 ; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4199,28 +4330,31 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 @@ -4235,22 +4369,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 @@ -4265,22 +4402,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4294,23 +4434,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4324,23 +4466,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4354,22 +4498,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4383,27 +4530,29 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -4419,20 +4568,21 @@ define half @v_rsq_f16(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4440,23 +4590,24 @@ define half @v_rsq_f16(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4481,20 +4632,21 @@ define half @v_neg_rsq_f16(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4502,23 +4654,24 @@ define half @v_neg_rsq_f16(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4553,20 +4706,21 @@ define { half, half } @v_rsq_f16_multi_use(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4574,23 +4728,24 @@ define { half, half } @v_rsq_f16_multi_use(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4630,20 +4785,21 @@ define half @v_rsq_f16_missing_contract0(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4651,23 +4807,24 @@ define half @v_rsq_f16_missing_contract0(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4702,20 +4859,21 @@ define half @v_rsq_f16_missing_contract1(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4723,23 +4881,24 @@ define half @v_rsq_f16_missing_contract1(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4774,20 +4933,21 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4795,23 +4955,24 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4846,20 +5007,21 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4867,23 +5029,24 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4918,20 +5081,21 @@ define half @v_neg_rsq_f16_fabs(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -4939,23 +5103,24 @@ define half @v_neg_rsq_f16_fabs(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -4991,20 +5156,21 @@ define half @v_rsq_f16_arcp(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5012,23 +5178,24 @@ define half @v_rsq_f16_arcp(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5053,20 +5220,21 @@ define half @v_neg_rsq_f16_arcp(half %a) { ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -5074,23 +5242,24 @@ define half @v_neg_rsq_f16_arcp(half %a) { ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5125,10 +5294,12 @@ define half @v_rsq_f16_afn(half %a) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -5153,10 +5324,12 @@ define half @v_rsq_f16_afn_nocontract(half %a) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -5192,35 +5365,36 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -5230,40 +5404,42 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5272,28 +5448,31 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -5307,22 +5486,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -5336,22 +5518,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5364,23 +5549,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2 -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5393,23 +5580,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5422,22 +5611,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5450,7 +5642,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5458,20 +5650,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -5485,35 +5679,36 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8 -; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7 ; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -5523,40 +5718,42 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -5565,28 +5762,31 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5 -; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5 -; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7 -; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5 +; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 +; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 @@ -5600,22 +5800,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 -; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6 +; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5 +; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4 +; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 @@ -5629,22 +5832,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4 -; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5 -; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0 -; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 ; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5657,23 +5863,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2 -; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5686,23 +5894,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2 -; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0] -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5715,22 +5925,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5 -; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5743,7 +5956,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, -1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5751,20 +5964,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2 -; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0] -; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 549af87c94949..302b2395642d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -88,10 +88,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_or_b32_e32 v1, s4, v0 ; CI-NEXT: .LBB0_8: ; %Flow19 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s2, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2 ; CI-NEXT: v_mov_b32_e32 v0, 0x7e00 ; CI-NEXT: s_and_b32 s2, 1, s2 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc @@ -1196,15 +1197,16 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_or_b32_e32 v1, s4, v1 ; CI-NEXT: .LBB9_16: ; %Flow54 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, 0 ; CI-NEXT: s_and_b32 s0, s0, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00 ; CI-NEXT: s_cselect_b32 s4, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s2, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_mov_b32_e32 v2, 0x7e00 ; CI-NEXT: s_and_b32 s3, 1, s4 @@ -1728,25 +1730,26 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_or_b32_e32 v3, s1, v3 ; CI-NEXT: .LBB10_32: ; %Flow124 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v5, 0 ; CI-NEXT: s_and_b32 s1, s4, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00 ; CI-NEXT: s_cselect_b32 s11, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4 +; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 ; CI-NEXT: s_and_b32 s2, s6, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00 ; CI-NEXT: s_cselect_b32 s6, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4 +; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s3 ; CI-NEXT: s_and_b32 s4, s5, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 ; CI-NEXT: s_cselect_b32 s12, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4 +; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s10 ; CI-NEXT: s_and_b32 s7, s7, 0x7fff ; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00 ; CI-NEXT: s_cselect_b32 s7, 1, 0 -; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4 +; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_mov_b32_e32 v4, 0x7e00 ; CI-NEXT: s_and_b32 s10, 1, s11 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 9e152253bb6ca..9233f8059a202 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -7464,15 +7464,18 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 @@ -7636,24 +7639,27 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; SI-GISEL-NEXT: s_mov_b32 s10, 0 ; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0 ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -8706,10 +8712,12 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0 +; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0 +; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8788,15 +8796,17 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a) ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0 -; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0 +; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index ac356fad5b2da..af79c911f29f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -6011,7 +6011,8 @@ define half @v_exp_f16_fast(half %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 @@ -6511,9 +6512,10 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -6707,11 +6709,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index d12ebe49814d8..a99c1991a7909 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -6092,7 +6092,8 @@ define half @v_exp10_f16_fast(half %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 @@ -6593,9 +6594,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -6789,11 +6791,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) { ; SI-GISEL: ; %bb.0: ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1 -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 259ee0b26d2d8..3f66c23e1a73b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -488,11 +488,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -580,13 +582,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index ba77552e5809b..21e6faf46f58d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -313,11 +313,13 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float @@ -1007,26 +1009,28 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> @@ -1221,23 +1225,25 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4 ; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3 +; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: s_setpc_b64 s[30:31] @@ -1435,28 +1441,30 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 -; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2 -; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2 +; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2 -; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5 +; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -1614,14 +1622,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 @@ -1780,15 +1790,17 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0 ; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1 +; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1 +; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2 ; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index c90b2c9170414..4f73e8e9c1883 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -271,7 +271,8 @@ define half @v_maximumnum_f16_1.0(half %x) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 64e8b7b50de08..558006d2b6957 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -271,7 +271,8 @@ define half @v_minimumnum_f16_1.0(half %x) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ;