From bbf2da8b8e581bb040c7bdef745667e338652119 Mon Sep 17 00:00:00 2001
From: tomershafir
Date: Wed, 20 Aug 2025 15:28:42 +0300
Subject: [PATCH] [AArch64] Split zero cycle zeroing per register class

This change improves the accuracy of LLVM's subtarget model by splitting the
AArch64 zero cycle zeroing features per register class, similarly to the
earlier ZCM modeling improvements. This aligns with how the microarchitecture
is designed: each register bank has unique capabilities.

It splits `HasZeroCycleZeroingGP` into `HasZeroCycleZeroingGPR32` and
`HasZeroCycleZeroingGPR64`, removes the opaque `FeatureZCZeroing`, and renames
`FeatureNoZCZeroingFP` to `FeatureNoZCZeroingFPR64`, inferring the FPR64 class
from its single usage in `AArch64AsmPrinter.cpp`.

It also splits `arm64-zero-cycle-zeroing.ll` into two tests, one `-gpr` and one
`-fpr`, similarly to ZCM, to make the tests more focused and manageable and to
match the new modeling. The test cases are updated as well, exploiting the fact
that this is a refactoring patch:
- remove redundant functions that merely combine already isolated ones (t1-t4)
- specialize check prefixes
- replace `apple-a10` with `apple-m1`
- add a `-mtriple=arm64-apple-macosx -mcpu=generic` test case for GPR
- isolate the `-mtriple=arm64-apple-ios -mcpu=cyclone` FP workaround test case
  and move `-fullfp16` to another, non-workaround test case
---
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp |   2 +-
 llvm/lib/Target/AArch64/AArch64Features.td    |  15 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   4 +-
 llvm/lib/Target/AArch64/AArch64Processors.td  |  43 ++--
 .../AArch64/arm64-copy-phys-zero-reg.mir      |  12 +-
 .../AArch64/arm64-zero-cycle-zeroing-fpr.ll   | 153 ++++++++++++
 .../AArch64/arm64-zero-cycle-zeroing-gpr.ll   |  41 ++++
 .../AArch64/arm64-zero-cycle-zeroing.ll       | 231 ------------------
 llvm/test/CodeGen/AArch64/f16-imm.ll          |   4 +-
 9 files changed, 240 insertions(+), 265 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll

diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c52487ab8a79a..23f5331cea53c 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1829,7 +1829,7 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
 
 void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
   Register DestReg = MI.getOperand(0).getReg();
-  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
+  if (STI->hasZeroCycleZeroingFPR64() && !STI->hasZeroCycleZeroingFPWorkaround() &&
       STI->isNeonAvailable()) {
     // Convert H/S register to corresponding D register
     if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index c1c1f0a1024d0..4b75c928c5b54 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -627,19 +627,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
 def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
                                              "Has zero-cycle register moves for FPR32 registers">;
 
-def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
-                                          "Has zero-cycle zeroing instructions for generic registers">;
+def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64",
"HasZeroCycleZeroingGPR64", "true", + "Has zero-cycle zeroing instructions for GPR64 registers">; + +def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true", + "Has zero-cycle zeroing instructions for GPR32 registers">; // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". // as movi is more efficient across all cores. Newer cores can eliminate // fmovs early and there is no difference with movi, but this not true for // all implementations. -def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", - "Has no zero-cycle zeroing instructions for FP registers">; - -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions", - [FeatureZCZeroingGP]>; +def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false", + "Has no zero-cycle zeroing instructions for FPR64 registers">; /// ... but the floating-point version doesn't quite work in rare cases on older /// CPUs. diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index d15f90deba74e..bac19e2c77ce3 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } - } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) { BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); @@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); - } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) { BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 42eaeca906e66..a924b6fea030a 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -321,7 +321,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureFuseAES, FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureZCZeroingFPWorkaround]>; def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", @@ -334,7 +335,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", "Apple A11", [ @@ -346,7 +348,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", "Apple A12", [ @@ -358,7 +361,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", 
"ARMProcFamily", "AppleA12", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", "Apple A13", [ @@ -370,7 +374,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureFuseCryptoEOR, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", "Apple A14", [ @@ -387,7 +392,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", "Apple A15", [ @@ -404,7 +410,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", "Apple A16", [ @@ -421,7 +428,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", "Apple A17", [ @@ -438,7 +446,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureFuseLiterals, FeatureStorePairSuppress, FeatureZCRegMoveGPR64, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", "Apple M4", [ @@ -454,8 +463,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseCryptoEOR, FeatureFuseLiterals, FeatureZCRegMoveGPR64, - FeatureZCZeroing - ]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", @@ -487,13 +496,15 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", FeatureStorePairSuppress, FeatureALULSLFast, FeaturePostRAScheduler, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", "Qualcomm Kryo processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureALULSLFast, FeatureStorePairSuppress]>; @@ -501,7 +512,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", "Qualcomm Falkor processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast, FeatureSlowSTRQro]>; @@ -597,7 +609,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast]>; diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir 
index 76b5b76130657..284d624a4e68f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir +++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir @@ -1,15 +1,15 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s --- | diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll new file mode 100644 index 0000000000000..2a75976d58549 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16 +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < 
%s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 + +define half @tf16() { +entry: +; ALL-LABEL: tf16: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64: mov s0, wzr +; NOZCZ-FPR64-FULLFP16: mov h0, wzr +; ZCZ-FPR64: movi d0, #0 + ret half 0.0 +} + +define float @tf32() { +entry: +; ALL-LABEL: tf32: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64: mov s0, wzr +; ZCZ-FPR64: movi d0, #0 + ret float 0.0 +} + +define double @td64() { +entry: +; ALL-LABEL: td64: +; FP-WORKAROUND: mov d0, xzr +; NOZCZ-FPR64: mov d0, xzr +; ZCZ-FPR64: movi d0, #0 + ret double 0.0 +} + +define <8 x i8> @tv8i8() { +entry: +; ALL-LABEL: tv8i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i8> +} + +define <4 x i16> @tv4i16() { +entry: +; ALL-LABEL: tv4i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i16> +} + +define <2 x i32> @tv2i32() { +entry: +; ALL-LABEL: tv2i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i32> +} + +define <2 x float> @tv2f32() { +entry: +; ALL-LABEL: tv2f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x float> +} + +define <16 x i8> @tv16i8() { +entry: +; ALL-LABEL: tv16i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <16 x i8> +} + +define <8 x i16> @tv8i16() { +entry: +; ALL-LABEL: tv8i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i16> +} + +define <4 x i32> @tv4i32() { +entry: +; ALL-LABEL: tv4i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i32> +} + +define <2 x i64> @tv2i64() { +entry: +; ALL-LABEL: tv2i64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i64> +} + +define <4 x float> @tv4f32() { +entry: +; ALL-LABEL: tv4f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x float> +} + +define <2 x double> @tv2d64() { +entry: +; ALL-LABEL: tv2d64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x double> +} + +; We used to produce spills+reloads for a Q register with zero cycle zeroing +; enabled. 
+; ALL-LABEL: foo: +; ALL-NOT: str q{{[0-9]+}} +; ALL-NOT: ldr q{{[0-9]+}} +define double @foo(i32 %n) { +entry: + br label %for.body + +for.body: + %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] + %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %conv21 = sitofp i32 %i.076 to double + %call = tail call fast double @sin(double %conv21) + %cmp.i = fcmp fast olt double %phi0, %call + %v0 = select i1 %cmp.i, double %call, double %phi0 + %inc = add nuw nsw i32 %i.076, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret double %v0 +} + +declare double @sin(double) diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll new file mode 100644 index 0000000000000..dc643062d8697 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr32 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 + +define i8 @ti8() { +entry: +; ALL-LABEL: ti8: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i8 0 +} + +define i16 @ti16() { +entry: +; ALL-LABEL: ti16: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i16 0 +} + +define i32 @ti32() { +entry: +; ALL-LABEL: ti32: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i32 0 +} + +define i64 @ti64() { +entry: +; ALL-LABEL: ti64: +; NOZCZ-GPR: mov x0, xzr +; ZCZ-GPR64: mov x0, #0 + ret i64 0 +} diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll deleted file mode 100644 index 6c3cd4766d799..0000000000000 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ /dev/null @@ -1,231 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=apple-a10 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16 -; RUN: llc < %s -mtriple=aarch64-linux-gnu 
-mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP - -declare void @bar(half, float, double, <2 x double>) -declare void @bari(i32, i32) -declare void @barl(i64, i64) -declare void @barf(float, float) - -define void @t1() nounwind ssp { -entry: -; ALL-LABEL: t1: -; ALL-NOT: fmov -; NONEFP-DAG: fmov s0, wzr -; NONEFP-DAG: fmov s1, wzr -; NONEFP-DAG: fmov d2, xzr -; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; NONE16: fmov h0, wzr -; NONE16: fmov s1, wzr -; NONE16: fmov d2, xzr -; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; ZEROFP-DAG: movi d0, #0 -; ZEROFP-DAG: movi d1, #0 -; ZEROFP-DAG: movi d2, #0 -; ZEROFP-DAG: movi v3.2d, #0 -; ZERO16: movi d0, #0 -; ZERO16: movi d1, #0 -; ZERO16: movi d2, #0 -; ZERO16: movi v3.2d, #0 - tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> ) nounwind - ret void -} - -define void @t2() nounwind ssp { -entry: -; ALL-LABEL: t2: -; NONEGP: mov w0, wzr -; NONEGP: mov w1, wzr -; ZEROGP: mov w0, #0 -; ZEROGP: mov w1, #0 - tail call void @bari(i32 0, i32 0) nounwind - ret void -} - -define void @t3() nounwind ssp { -entry: -; ALL-LABEL: t3: -; NONEGP: mov x0, xzr -; NONEGP: mov x1, xzr -; ZEROGP: mov x0, #0 -; ZEROGP: mov x1, #0 - tail call void @barl(i64 0, i64 0) nounwind - ret void -} - -define void @t4() nounwind ssp { -; ALL-LABEL: t4: -; NONEFP: fmov s{{[0-3]+}}, wzr -; NONEFP: fmov s{{[0-3]+}}, wzr -; ZEROFP: movi d0, #0 -; ZEROFP: movi d1, #0 - tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind - ret void -} - -declare double @sin(double) - -; We used to produce spills+reloads for a Q register with zero cycle zeroing -; enabled. 
-; ALL-LABEL: foo: -; ALL-NOT: str q{{[0-9]+}} -; ALL-NOT: ldr q{{[0-9]+}} -define double @foo(i32 %n) { -entry: - br label %for.body - -for.body: - %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] - %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %conv21 = sitofp i32 %i.076 to double - %call = tail call fast double @sin(double %conv21) - %cmp.i = fcmp fast olt double %phi0, %call - %v0 = select i1 %cmp.i, double %call, double %phi0 - %inc = add nuw nsw i32 %i.076, 1 - %cmp = icmp slt i32 %inc, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret double %v0 -} - -define <2 x i64> @t6() { -; ALL-LABEL: t6: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> zeroinitializer -} - -define i1 @ti1() { -entry: -; ALL-LABEL: ti1: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i1 false -} - -define i8 @ti8() { -entry: -; ALL-LABEL: ti8: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i8 0 -} - -define i16 @ti16() { -entry: -; ALL-LABEL: ti16: -; NONEGP: mov w0, wzr - ; ZEROGP: mov w0, #0 - ret i16 0 -} - -define i32 @ti32() { -entry: -; ALL-LABEL: ti32: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i32 0 -} - -define i64 @ti64() { -entry: -; ALL-LABEL: ti64: -; NONEGP: mov x0, xzr -; ZEROGP: mov x0, #0 - ret i64 0 -} - -define float @tf32() { -entry: -; ALL-LABEL: tf32: -; NONEFP: mov s0, wzr -; ZEROFP: movi d0, #0 - ret float 0.0 -} - -define double @td64() { -entry: -; ALL-LABEL: td64: -; NONEFP: mov d0, xzr -; ZEROFP: movi d0, #0 - ret double 0.0 -} - -define <8 x i8> @tv8i8() { -entry: -; ALL-LABEL: tv8i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i8> -} - -define <4 x i16> @tv4i16() { -entry: -; ALL-LABEL: tv4i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i16> -} - -define <2 x i32> @tv2i32() { -entry: -; ALL-LABEL: tv2i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i32> -} - -define <2 x float> @tv2f32() { -entry: -; ALL-LABEL: tv2f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x float> -} - -define <16 x i8> @tv16i8() { -entry: -; ALL-LABEL: tv16i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <16 x i8> -} - -define <8 x i16> @tv8i16() { -entry: -; ALL-LABEL: tv8i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i16> -} - -define <4 x i32> @tv4i32() { -entry: -; ALL-LABEL: tv4i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i32> -} - -define <2 x i64> @tv2i64() { -entry: -; ALL-LABEL: tv2i64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> -} - -define <4 x float> @tv4f32() { -entry: -; ALL-LABEL: tv4f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x float> -} - -define <2 x double> @tv2d64() { -entry: -; ALL-LABEL: tv2d64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x double> -} - diff --git a/llvm/test/CodeGen/AArch64/f16-imm.ll b/llvm/test/CodeGen/AArch64/f16-imm.ll index 58793bf19f3a6..68873f9b7c3de 100644 --- a/llvm/test/CodeGen/AArch64/f16-imm.ll +++ b/llvm/test/CodeGen/AArch64/f16-imm.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fp | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz-gpr32,+zcz-gpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ ; RUN: llc < %s 
-mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFP16 define half @Const0() {