Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64][GlobalISel] Better vecreduce.fadd lowering. #73294

Closed
wants to merge 1 commit into from

Conversation

davemgreen
Copy link
Collaborator

This changes the fadd legalization to handle fp16 types, and treats more types as legal so that the backend can produce the correct patterns. There is currently a missing identity fold for `fadd x, -0.0 -> x`, so the GlobalISel output still contains an extra fadd of the -0.0 start value.

@llvmbot
Copy link
Collaborator

llvmbot commented Nov 24, 2023

@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-globalisel

Author: David Green (davemgreen)

Changes

This changes the fadd legalization to handle fp16 types, and treats more types as legal so that the backend can produce the correct patterns. There is currently a missing identity fold for `fadd x, -0.0 -> x`, so the GlobalISel output still contains an extra fadd of the -0.0 start value.


Patch is 54.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73294.diff

3 Files Affected:

  • (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+1)
  • (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+11-3)
  • (modified) llvm/test/CodeGen/AArch64/vecreduce-fadd.ll (+800-436)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 11d01429485dcbc..f6834eb3d4502b0 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2819,6 +2819,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     Observer.changedInstr(MI);
     return Legalized;
   }
+  case TargetOpcode::G_VECREDUCE_FADD:
   case TargetOpcode::G_VECREDUCE_FMIN:
   case TargetOpcode::G_VECREDUCE_FMAX:
   case TargetOpcode::G_VECREDUCE_FMINIMUM:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 316a9eaa63d4bb4..e665bf42a98de8a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -970,11 +970,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .legalFor(PackedVectorAllTypeList)
       .lowerIf(isScalar(0));
 
+  // For fadd reductions we have pairwise operations available. We treat the
+  // usual legal types as legal and handle the lowering to pairwise instructions
+  // later.
   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
-      // We only have FADDP to do reduction-like operations. Lower the rest.
-      .legalFor({{s32, v2s32}, {s64, v2s64}})
+      .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
+      .legalIf([=](const LegalityQuery &Query) {
+        const auto &Ty = Query.Types[1];
+        return (Ty == v4s16 || Ty == v8s16) && HasFP16;
+      })
+      .minScalarOrElt(0, MinFPScalar)
       .clampMaxNumElements(1, s64, 2)
-      .clampMaxNumElements(1, s32, 2)
+      .clampMaxNumElements(1, s32, 4)
+      .clampMaxNumElements(1, s16, 8)
       .lower();
 
   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index e770def93aa4e6c..43e44b6832f8c14 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -1,223 +1,346 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck --check-prefixes=CHECK,FULLFP16 %s
-; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefixes=CHECK,CHECKNOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define float @add_HalfS(<2 x float> %bin.rdx)  {
-; CHECK-LABEL: add_HalfS:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    faddp s0, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_HalfS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    faddp s0, v0.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_HalfS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    movi v1.2s, #128, lsl #24
+; CHECK-GI-NEXT:    faddp s0, v0.2s
+; CHECK-GI-NEXT:    fadd s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
   ret float %r
 }
 
 define half @add_HalfH(<4 x half> %bin.rdx)  {
-; FULLFP16-LABEL: add_HalfH:
-; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
-; FULLFP16-NEXT:    faddp h0, v0.2h
-; FULLFP16-NEXT:    ret
+; CHECK-SD-NOFP16-LABEL: add_HalfH:
+; CHECK-SD-NOFP16:       // %bb.0:
+; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    ret
 ;
-; CHECKNOFP16-LABEL: add_HalfH:
-; CHECKNOFP16:       // %bb.0:
-; CHECKNOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECKNOFP16-NEXT:    mov h1, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s2, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s2, s1
-; CHECKNOFP16-NEXT:    mov h2, v0.h[2]
-; CHECKNOFP16-NEXT:    mov h0, v0.h[3]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s0, s1, s0
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    ret
+; CHECK-SD-FP16-LABEL: add_HalfH:
+; CHECK-SD-FP16:       // %bb.0:
+; CHECK-SD-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
+; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT:    ret
+;
+; CHECK-GI-NOFP16-LABEL: add_HalfH:
+; CHECK-GI-NOFP16:       // %bb.0:
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
+; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: add_HalfH:
+; CHECK-GI-FP16:       // %bb.0:
+; CHECK-GI-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT:    adrp x8, .LCPI1_0
+; CHECK-GI-FP16-NEXT:    ldr h1, [x8, :lo12:.LCPI1_0]
+; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT:    fadd h0, h0, h1
+; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
   ret half %r
 }
 
 
 define half @add_H(<8 x half> %bin.rdx)  {
-; FULLFP16-LABEL: add_H:
-; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
-; FULLFP16-NEXT:    faddp h0, v0.2h
-; FULLFP16-NEXT:    ret
+; CHECK-SD-NOFP16-LABEL: add_H:
+; CHECK-SD-NOFP16:       // %bb.0:
+; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    ret
+;
+; CHECK-SD-FP16-LABEL: add_H:
+; CHECK-SD-FP16:       // %bb.0:
+; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT:    ret
+;
+; CHECK-GI-NOFP16-LABEL: add_H:
+; CHECK-GI-NOFP16:       // %bb.0:
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ret
 ;
-; CHECKNOFP16-LABEL: add_H:
-; CHECKNOFP16:       // %bb.0:
-; CHECKNOFP16-NEXT:    mov h1, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s2, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s2, s1
-; CHECKNOFP16-NEXT:    mov h2, v0.h[2]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    mov h2, v0.h[3]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    mov h2, v0.h[4]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    mov h2, v0.h[5]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    mov h2, v0.h[6]
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s1, s1, s2
-; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s0, s1, s0
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    ret
+; CHECK-GI-FP16-LABEL: add_H:
+; CHECK-GI-FP16:       // %bb.0:
+; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    adrp x8, .LCPI2_0
+; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    ldr h1, [x8, :lo12:.LCPI2_0]
+; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT:    fadd h0, h0, h1
+; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
   ret half %r
 }
 
 define float @add_S(<4 x float> %bin.rdx)  {
-; CHECK-LABEL: add_S:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    faddp s0, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_S:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT:    faddp s0, v0.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_S:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT:    movi v1.2s, #128, lsl #24
+; CHECK-GI-NEXT:    faddp s0, v0.2s
+; CHECK-GI-NEXT:    fadd s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
   ret float %r
 }
 
 define double @add_D(<2 x double> %bin.rdx)  {
-; CHECK-LABEL: add_D:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    faddp d0, v0.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: add_D:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    faddp d0, v0.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: add_D:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    faddp d0, v0.2d
+; CHECK-GI-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    fadd d0, d0, d1
+; CHECK-GI-NEXT:    ret
   %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
   ret double %r
 }
 
 define half @add_2H(<16 x half> %bin.rdx)  {
-; FULLFP16-LABEL: add_2H:
-; FULLFP16:       // %bb.0:
-; FULLFP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
-; FULLFP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
-; FULLFP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
-; FULLFP16-NEXT:    faddp h0, v0.2h
-; FULLFP16-NEXT:    ret
+; CHECK-SD-NOFP16-LABEL: add_2H:
+; CHECK-SD-NOFP16:       // %bb.0:
+; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[1]
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
+; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s3
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s4, s2
+; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s3
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s4
+; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
+; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT:    fadd s3, s5, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    ret
 ;
-; CHECKNOFP16-LABEL: add_2H:
-; CHECKNOFP16:       // %bb.0:
-; CHECKNOFP16-NEXT:    mov h2, v1.h[1]
-; CHECKNOFP16-NEXT:    mov h3, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s4, h1
-; CHECKNOFP16-NEXT:    fcvt s5, h0
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fadd s4, s5, s4
-; CHECKNOFP16-NEXT:    mov h5, v0.h[2]
-; CHECKNOFP16-NEXT:    fadd s2, s3, s2
-; CHECKNOFP16-NEXT:    mov h3, v1.h[2]
-; CHECKNOFP16-NEXT:    fcvt h4, s4
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s3, s5, s3
-; CHECKNOFP16-NEXT:    mov h5, v0.h[3]
-; CHECKNOFP16-NEXT:    fadd s2, s4, s2
-; CHECKNOFP16-NEXT:    mov h4, v1.h[3]
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s4, s5, s4
-; CHECKNOFP16-NEXT:    mov h5, v0.h[4]
-; CHECKNOFP16-NEXT:    fadd s2, s2, s3
-; CHECKNOFP16-NEXT:    mov h3, v1.h[4]
-; CHECKNOFP16-NEXT:    fcvt h4, s4
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s3, s5, s3
-; CHECKNOFP16-NEXT:    mov h5, v0.h[5]
-; CHECKNOFP16-NEXT:    fadd s2, s2, s4
-; CHECKNOFP16-NEXT:    mov h4, v1.h[5]
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s4, s5, s4
-; CHECKNOFP16-NEXT:    mov h5, v0.h[6]
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    fadd s2, s2, s3
-; CHECKNOFP16-NEXT:    fcvt h3, s4
-; CHECKNOFP16-NEXT:    mov h4, v1.h[6]
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fcvt s4, h4
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s0, s0, s1
-; CHECKNOFP16-NEXT:    fadd s2, s2, s3
-; CHECKNOFP16-NEXT:    fadd s3, s5, s4
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fadd s2, s2, s3
-; CHECKNOFP16-NEXT:    fcvt h1, s2
-; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fadd s0, s1, s0
-; CHECKNOFP16-NEXT:    fcvt h0, s0
-; CHECKNOFP16-NEXT:    ret
+; CHECK-SD-FP16-LABEL: add_2H:
+; CHECK-SD-FP16:       // %bb.0:
+; CHECK-SD-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT:    ret
+;
+; CHECK-GI-NOFP16-LABEL: add_2H:
+; CHECK-GI-NOFP16:       // %bb.0:
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s1, w8
+; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT:    ret
+;
+; CHECK-GI-FP16-LABEL: add_2H:
+; CHECK-GI-FP16:       // %bb.0:
+; CHECK-GI-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT:    adrp x8, .LCPI5_0
+; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    ldr h1, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT:    fadd h0, h0, h1
+; CHE...
[truncated]

This changes the fadd legalization to handle fp16 types, and treats more types
as legal so that the backend can produce the correct patterns. There is
currently a missing identity fold for `fadd x, -0.0 -> x`.
davemgreen added a commit that referenced this pull request Nov 27, 2023
This changes the fadd legalization to handle fp16 types, and treats more types
as legal so that the backend can produce the correct patterns. There is
currently a missing identity fold for `fadd x, -0.0 -> x`.
@davemgreen davemgreen closed this Nov 28, 2023
@davemgreen davemgreen deleted the gh-gi-vecreducefadd branch November 28, 2023 07:57
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants