From c90011dd900e5519da62cd2481d89b6001166f0d Mon Sep 17 00:00:00 2001 From: valadaptive Date: Wed, 26 Nov 2025 18:26:52 -0500 Subject: [PATCH 1/6] [AArch64][ARM] Add new tests for tbl/tbx optimizations --- .../Transforms/InstCombine/AArch64/tbl.ll | 272 ++++++++++++++++++ .../Transforms/InstCombine/AArch64/tbl1.ll | 65 ----- llvm/test/Transforms/InstCombine/ARM/tbl.ll | 218 ++++++++++++++ llvm/test/Transforms/InstCombine/ARM/tbl1.ll | 35 --- 4 files changed, 490 insertions(+), 100 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl.ll delete mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl1.ll create mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl.ll delete mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl1.ll diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll new file mode 100644 index 0000000000000..405b4c13700a4 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -0,0 +1,272 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic tbl1 with all in-bounds indices should optimize to shufflevector. +define <16 x i8> @tbl1_basic(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with both operands the same should optimize (1 unique source). +define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 with alternating duplicate operands should optimize (2 unique sources). +define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references first two operands should optimize. +define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references one operand should optimize. 
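+; (Indices below 16 select from the first source register, so only %a is read here.)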
+define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). +define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with all OOB indices should optimize to zero vector. +define <16 x i8> @tbl1_all_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_oob( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99)) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing all 3 operands should NOT optimize. +define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing 3 unique operands should NOT optimize. +define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing all 4 unique operands should NOT optimize. 
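+; (A shufflevector instruction can reference at most two vectors, so four distinct sources cannot be expressed as one.)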
+define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbx1 with no OOB should optimize. +define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 where fallback == second source operand should optimize (deduplicated). +define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB where fallback == source should optimize (deduplicated). +define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) { +; CHECK-LABEL: @tbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[A]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBX]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with all OOB indices should optimize to fallback. +define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_all_oob( +; CHECK-NEXT: [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99)) +; CHECK-NEXT: ret <16 x i8> [[FALLBACK]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize. +define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbx1 with no OOB and mismatched fallback/source sizes should optimize. 
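+; (When no index is out of bounds, the fallback vector is never read, so its element count does not matter.)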
+define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbl1 with non-i8 element type should NOT optimize. +define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_8x16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TBL1]] +; +entry: + %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) + ret <8 x i16> %tbl1 +} +declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) + +; tbl1 with non-8/16 element count should NOT optimize. +define <12 x i8> @tbl1_16x8(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_16x8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> ) +; CHECK-NEXT: ret <12 x i8> [[TBL1]] +; +entry: + %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> ) + ret <12 x i8> %tbl1 +} +declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>) + +; Non-constant mask should NOT optimize. +define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) { +; CHECK-LABEL: @tbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask) + ret <16 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. 
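+; (A poison index places no constraint on its lane, so an all-poison mask folds the whole result to poison.)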
+define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_poison_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison) + ret <16 x i8> %tbl +} + +; "Real" declarations +declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone +declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll deleted file mode 100644 index 362cc0f6c4493..0000000000000 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if a mask index is out of range. 
-define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[TBL1]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if the size of the return vector is not 8 elements. -define <16 x i8> @tbl1_16x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_16x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> ) -; CHECK-NEXT: ret <16 x i8> [[TBL1]] -; -entry: - %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> ) - ret <16 x i8> %tbl1 -} - -; Bail the optimization if the elements of the return vector are not of type i8. -define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) -; CHECK-NEXT: ret <8 x i16> [[TBL1]] -; -entry: - %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) - ret <8 x i16> %tbl1 -} - -; The type <8 x i16> is not a valid return type for this intrinsic, -; but we want to test that the optimization won't trigger for vector -; elements of type different than i8. -declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) - -declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) -declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll new file mode 100644 index 0000000000000..1e736ec62c87e --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -0,0 +1,218 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-arm-none-eabi" + +; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic vtbl1 with all in-bounds indices should optimize to shufflevector. +define <8 x i8> @vtbl1_basic(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with both operands the same should be optimized (1 unique source). +define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 with alternating duplicate operands should optimize (2 unique sources). 
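+; (%a and %b each appear twice, so deduplication leaves exactly two unique shuffle sources.)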
+define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references first two operands should optimize. +define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references one operand should optimize. +define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). +define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with all OOB indices should optimize to zero vector. +define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_oob( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> splat (i8 99)) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing all 3 operands should NOT optimize. 
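+; (Three distinct sources exceed the two vector operands a shufflevector can reference.)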
+define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing 3 unique operands should NOT optimize. +define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing all 4 unique operands should NOT optimize. +define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbx1 with no OOB should optimize. +define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 where fallback == second source operand should optimize (deduplicated). +define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[B:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with OOB where fallback == source should optimize (deduplicated). +define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) { +; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with all OOB indices should optimize to fallback. 
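+; (For vtbx1 the table is a single 8-byte register, so every index >= 8 is out of bounds and each lane takes the corresponding fallback element.)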
+define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_all_oob( +; CHECK-NEXT: [[FALLBACK:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK1:%.*]], <8 x i8> [[A:%.*]], <8 x i8> splat (i8 99)) +; CHECK-NEXT: ret <8 x i8> [[FALLBACK]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; Non-constant mask should NOT optimize. +define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) { +; CHECK-LABEL: @vtbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask) + ret <8 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. +define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_poison_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> poison) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison) + ret <8 x i8> %tbl +} + +; Declarations +declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll deleted file mode 100644 index fbec1a2bb7a07..0000000000000 --- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv8-arm-none-eabi" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -; Bail the optimization if a mask index is out of range. 
-define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[VTBL1]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) From cad833d169c329f8e80f8095ed608a4c7d1a304a Mon Sep 17 00:00:00 2001 From: valadaptive Date: Wed, 26 Nov 2025 18:28:24 -0500 Subject: [PATCH 2/6] [AArch64][ARM] Optimize more tbl/tbx calls into shufflevector --- .../InstCombine/InstCombineCalls.cpp | 139 +++++++++++++++--- .../Transforms/InstCombine/AArch64/tbl.ll | 33 ++--- llvm/test/Transforms/InstCombine/ARM/tbl.ll | 29 ++-- 3 files changed, 144 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 743c4f574e131..d57b86bc40f46 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -737,42 +737,122 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { return nullptr; } -/// Convert a table lookup to shufflevector if the mask is constant. -/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in -/// which case we could lower the shufflevector with rev64 instructions -/// as it's actually a byte reverse. -static Value *simplifyNeonTbl1(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { +/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and +/// at most two source operands are actually referenced. +static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC, + bool IsExtension) { // Bail out if the mask is not a constant. - auto *C = dyn_cast(II.getArgOperand(1)); + auto *C = dyn_cast(II.getArgOperand(II.arg_size() - 1)); if (!C) return nullptr; - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); + auto *RetTy = cast(II.getType()); + unsigned NumIndexes = RetTy->getNumElements(); - // Only perform this transformation for <8 x i8> vector types. - if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8) + // Only perform this transformation for <8 x i8> and <16 x i8> vector types. + if (!(RetTy->getElementType()->isIntegerTy(8) && + (NumIndexes == 8 || NumIndexes == 16))) return nullptr; - int Indexes[8]; + // For tbx instructions, the first argument is the "fallback" vector, which + // has the same length as the mask and return type. + unsigned int StartIndex = (unsigned)IsExtension; + auto *SourceTy = + cast(II.getArgOperand(StartIndex)->getType()); + // Note that the element count of each source vector does *not* need to be the + // same as the element count of the return type and mask! All source vectors + // must have the same element count as each other, though. + unsigned NumElementsPerSource = SourceTy->getNumElements(); + + // There are no tbl/tbx intrinsics for which the destination size exceeds the + // source size. However, our definitions of the intrinsics, at least in + // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it + // *could* technically happen. + if (NumIndexes > NumElementsPerSource) { + return nullptr; + } + + // The tbl/tbx intrinsics take several source operands followed by a mask + // operand. 
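+  // For example, a tbl2 call (src0, src1, mask) with <16 x i8> sources
+  // indexes the 32-byte concatenation of src0 and src1: index 3 reads byte 3
+  // of src0, index 17 reads byte 1 of src1 (17 / 16 = operand 1, 17 % 16 =
+  // element 1), and any index >= 32 is out of bounds (zero for tbl, the
+  // fallback lane for tbx).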
+ unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension; - for (unsigned I = 0; I < NumElts; ++I) { + // Map input operands to shuffle indices. This also helpfully deduplicates the + // input arguments, in case the same value is passed as an argument multiple + // times. + SmallDenseMap ValueToShuffleSlot; + Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy), + PoisonValue::get(SourceTy)}; + + int Indexes[16]; + for (unsigned I = 0; I < NumIndexes; ++I) { Constant *COp = C->getAggregateElement(I); - if (!COp || !isa(COp)) + if (!COp || (!isa(COp) && !isa(COp))) return nullptr; - Indexes[I] = cast(COp)->getLimitedValue(); + if (isa(COp)) { + Indexes[I] = -1; + continue; + } - // Make sure the mask indices are in range. - if ((unsigned)Indexes[I] >= NumElts) + uint64_t Index = cast(COp)->getZExtValue(); + // The index of the input argument that this index references (0 = first + // source argument, etc). + unsigned SourceOperandIndex = Index / NumElementsPerSource; + // The index of the element at that source operand. + unsigned SourceOperandElementIndex = Index % NumElementsPerSource; + + Value *SourceOperand; + if (SourceOperandIndex >= NumSourceOperands) { + // This index is out of bounds. Map it to index into either the fallback + // vector (tbx) or vector of zeroes (tbl). + SourceOperandIndex = NumSourceOperands; + if (IsExtension) { + // For out-of-bounds indices in tbx, choose the `I`th element of the + // fallback. + SourceOperand = II.getArgOperand(0); + SourceOperandElementIndex = I; + } else { + // Otherwise, choose some element from the dummy vector of zeroes (we'll + // always choose the first). + SourceOperand = Constant::getNullValue(SourceTy); + SourceOperandElementIndex = 0; + } + } else { + SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex); + } + + // The source operand may be the fallback vector, which may not have the + // same number of elements as the source vector. In that case, we *could* + // choose to extend its length with another shufflevector, but it's simpler + // to just bail instead. + if (cast(SourceOperand->getType())->getNumElements() != + NumElementsPerSource) { return nullptr; + } + + // We now know the source operand referenced by this index. Make it a + // shufflevector operand, if it isn't already. + unsigned NumSlots = ValueToShuffleSlot.size(); + // This shuffle references more than two sources, and hence cannot be + // represented as a shufflevector. 
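+    // (A shufflevector instruction can reference at most two input vectors.)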
+ if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) { + return nullptr; + } + auto [It, Inserted] = + ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots); + if (Inserted) { + ShuffleOperands[It->getSecond()] = SourceOperand; + } + + unsigned RemappedIndex = + (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex; + Indexes[I] = RemappedIndex; } - auto *V1 = II.getArgOperand(0); - auto *V2 = Constant::getNullValue(V1->getType()); - return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes)); + Value *Shuf = IC.Builder.CreateShuffleVector( + ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes)); + return IC.replaceInstUsesWith(II, Shuf); } // Returns true iff the 2 intrinsics have the same operands, limiting the @@ -3167,10 +3247,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: + case Intrinsic::arm_neon_vtbl2: + case Intrinsic::arm_neon_vtbl3: + case Intrinsic::arm_neon_vtbl4: case Intrinsic::aarch64_neon_tbl1: - if (Value *V = simplifyNeonTbl1(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; + case Intrinsic::aarch64_neon_tbl2: + case Intrinsic::aarch64_neon_tbl3: + case Intrinsic::aarch64_neon_tbl4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/false); + case Intrinsic::arm_neon_vtbx1: + case Intrinsic::arm_neon_vtbx2: + case Intrinsic::arm_neon_vtbx3: + case Intrinsic::arm_neon_vtbx4: + case Intrinsic::aarch64_neon_tbx1: + case Intrinsic::aarch64_neon_tbx2: + case Intrinsic::aarch64_neon_tbx3: + case Intrinsic::aarch64_neon_tbx4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/true); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll index 405b4c13700a4..f747f44a7ab9f 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -10,7 +10,7 @@ target triple = "aarch64" ; Basic tbl1 with all in-bounds indices should optimize to shufflevector. define <16 x i8> @tbl1_basic(<16 x i8> %a) { ; CHECK-LABEL: @tbl1_basic( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) @@ -20,7 +20,7 @@ define <16 x i8> @tbl1_basic(<16 x i8> %a) { ; tbl2 with both operands the same should optimize (1 unique source). define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { ; CHECK-LABEL: @tbl2_duplicate_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) @@ -30,7 +30,7 @@ define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { ; tbl4 with alternating duplicate operands should optimize (2 unique sources). 
define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: @tbl4_duplicate_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -40,7 +40,7 @@ define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { ; tbl4 where mask only references first two operands should optimize. define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: @tbl4_unused_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) @@ -50,7 +50,7 @@ define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ; tbl4 where mask only references one operand should optimize. define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: @tbl4_single_operand_used( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) @@ -60,7 +60,7 @@ define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> ; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { ; CHECK-LABEL: @tbl1_with_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) @@ -70,7 +70,7 @@ define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { ; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) { ; CHECK-LABEL: @tbl2_duplicate_with_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) @@ -90,8 +90,7 @@ define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) { ; tbl1 with all OOB indices should optimize to zero vector. 
define <16 x i8> @tbl1_all_oob(<16 x i8> %a) { ; CHECK-LABEL: @tbl1_all_oob( -; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99)) -; CHECK-NEXT: ret <16 x i8> [[TBL]] +; CHECK-NEXT: ret <16 x i8> zeroinitializer ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) ret <16 x i8> %tbl @@ -130,7 +129,7 @@ define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> % ; tbx1 with no OOB should optimize. define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { ; CHECK-LABEL: @tbx1_no_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) @@ -140,7 +139,7 @@ define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { ; tbx2 where fallback == second source operand should optimize (deduplicated). define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: @tbx2_fallback_equals_second_source( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) @@ -150,7 +149,7 @@ define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) ; tbx1 with OOB where fallback == source should optimize (deduplicated). define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) { ; CHECK-LABEL: @tbx1_oob_fallback_same_as_source( -; CHECK-NEXT: [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> ) +; CHECK-NEXT: [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[A]] ; %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) @@ -170,8 +169,7 @@ define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8 ; tbx1 with all OOB indices should optimize to fallback. define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) { ; CHECK-LABEL: @tbx1_all_oob( -; CHECK-NEXT: [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99)) -; CHECK-NEXT: ret <16 x i8> [[FALLBACK]] +; CHECK-NEXT: ret <16 x i8> [[FALLBACK:%.*]] ; %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %tbx @@ -190,7 +188,7 @@ define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) { ; tbx1 with no OOB and mismatched fallback/source sizes should optimize. 
define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) { ; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) @@ -236,7 +234,7 @@ define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) { ; Mask with some poison elements should optimize, with poison propagating to output. define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { ; CHECK-LABEL: @tbl1_poison_mask_elements( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) @@ -246,8 +244,7 @@ define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { ; Mask with all poison elements should optimize to poison. define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) { ; CHECK-LABEL: @tbl1_all_poison_mask( -; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison) -; CHECK-NEXT: ret <16 x i8> [[TBL]] +; CHECK-NEXT: ret <16 x i8> poison ; %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison) ret <16 x i8> %tbl diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll index 1e736ec62c87e..1aad566675867 100644 --- a/llvm/test/Transforms/InstCombine/ARM/tbl.ll +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -20,7 +20,7 @@ define <8 x i8> @vtbl1_basic(<8 x i8> %a) { ; vtbl2 with both operands the same should be optimized (1 unique source). define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { ; CHECK-LABEL: @vtbl2_duplicate_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) @@ -30,7 +30,7 @@ define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { ; vtbl4 with alternating duplicate operands should optimize (2 unique sources). define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: @vtbl4_duplicate_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) @@ -40,7 +40,7 @@ define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { ; vtbl4 where mask only references first two operands should optimize. 
define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; CHECK-LABEL: @vtbl4_unused_operands( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) @@ -50,7 +50,7 @@ define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 ; vtbl4 where mask only references one operand should optimize. define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; CHECK-LABEL: @vtbl4_single_operand_used( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) @@ -60,7 +60,7 @@ define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c ; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { ; CHECK-LABEL: @vtbl1_with_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) @@ -70,7 +70,7 @@ define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { ; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) { ; CHECK-LABEL: @vtbl2_duplicate_with_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) @@ -90,8 +90,7 @@ define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) { ; vtbl1 with all OOB indices should optimize to zero vector. define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) { ; CHECK-LABEL: @vtbl1_all_oob( -; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> splat (i8 99)) -; CHECK-NEXT: ret <8 x i8> [[TBL]] +; CHECK-NEXT: ret <8 x i8> zeroinitializer ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) ret <8 x i8> %tbl @@ -130,7 +129,7 @@ define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, ; vtbx1 with no OOB should optimize. 
define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { ; CHECK-LABEL: @vtbx1_no_oob( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) @@ -140,7 +139,7 @@ define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { ; vtbx2 where fallback == second source operand should optimize (deduplicated). define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: @vtbx2_fallback_equals_second_source( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[B:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) @@ -150,7 +149,7 @@ define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { ; vtbx1 with OOB where fallback == source should optimize (deduplicated). define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) { ; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) @@ -170,8 +169,7 @@ define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> % ; vtbx1 with all OOB indices should optimize to fallback. define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) { ; CHECK-LABEL: @vtbx1_all_oob( -; CHECK-NEXT: [[FALLBACK:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK1:%.*]], <8 x i8> [[A:%.*]], <8 x i8> splat (i8 99)) -; CHECK-NEXT: ret <8 x i8> [[FALLBACK]] +; CHECK-NEXT: ret <8 x i8> [[FALLBACK:%.*]] ; %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) ret <8 x i8> %tbx @@ -190,7 +188,7 @@ define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) { ; Mask with some poison elements should optimize, with poison propagating to output. define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { ; CHECK-LABEL: @vtbl1_poison_mask_elements( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i8> [[TMP1]] ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) @@ -200,8 +198,7 @@ define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { ; Mask with all poison elements should optimize to poison. 
define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) { ; CHECK-LABEL: @vtbl1_all_poison_mask( -; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> poison) -; CHECK-NEXT: ret <8 x i8> [[TBL]] +; CHECK-NEXT: ret <8 x i8> poison ; %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison) ret <8 x i8> %tbl From 744bc2302d89e0393577d8af3a8b3140b7084717 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Thu, 4 Dec 2025 13:29:34 -0500 Subject: [PATCH 3/6] Code style tweaks --- .../Transforms/InstCombine/InstCombineCalls.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index d57b86bc40f46..6e43f76e8d175 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -750,8 +750,8 @@ static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC, unsigned NumIndexes = RetTy->getNumElements(); // Only perform this transformation for <8 x i8> and <16 x i8> vector types. - if (!(RetTy->getElementType()->isIntegerTy(8) && - (NumIndexes == 8 || NumIndexes == 16))) + if (!RetTy->getElementType()->isIntegerTy(8) || + (NumIndexes != 8 && NumIndexes != 16)) return nullptr; // For tbx instructions, the first argument is the "fallback" vector, which @@ -768,9 +768,8 @@ static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC, // source size. However, our definitions of the intrinsics, at least in // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it // *could* technically happen. - if (NumIndexes > NumElementsPerSource) { + if (NumIndexes > NumElementsPerSource) return nullptr; - } // The tbl/tbx intrinsics take several source operands followed by a mask // operand. @@ -827,23 +826,21 @@ static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC, // choose to extend its length with another shufflevector, but it's simpler // to just bail instead. if (cast(SourceOperand->getType())->getNumElements() != - NumElementsPerSource) { + NumElementsPerSource) return nullptr; - } // We now know the source operand referenced by this index. Make it a // shufflevector operand, if it isn't already. unsigned NumSlots = ValueToShuffleSlot.size(); // This shuffle references more than two sources, and hence cannot be // represented as a shufflevector. 
- if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) { + if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) return nullptr; - } + auto [It, Inserted] = ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots); - if (Inserted) { + if (Inserted) ShuffleOperands[It->getSecond()] = SourceOperand; - } unsigned RemappedIndex = (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex; From 030e0d8f7db11eb34dc2870779afc868ab730464 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Thu, 4 Dec 2025 13:34:55 -0500 Subject: [PATCH 4/6] Add should-optimize tbl3 test --- llvm/test/Transforms/InstCombine/AArch64/tbl.ll | 10 ++++++++++ llvm/test/Transforms/InstCombine/ARM/tbl.ll | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll index f747f44a7ab9f..cbc18d6414039 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -27,6 +27,16 @@ define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { ret <16 x i8> %tbl } +; tbl3 referencing 2 unique operands should optimize. +define <16 x i8> @tbl3_two_sources(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + ; tbl4 with alternating duplicate operands should optimize (2 unique sources). define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: @tbl4_duplicate_operands( diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll index 1aad566675867..a6d304c8735eb 100644 --- a/llvm/test/Transforms/InstCombine/ARM/tbl.ll +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -27,6 +27,16 @@ define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { ret <8 x i8> %tbl } +; vtbl3 referencing 2 unique operands should optimize. +define <8 x i8> @vtbl3_two_sources(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + ; vtbl4 with alternating duplicate operands should optimize (2 unique sources). 
 define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @vtbl4_duplicate_operands(

From 09da6c30c9b0e2d2c25cafcc75555b28e31ca088 Mon Sep 17 00:00:00 2001
From: valadaptive
Date: Thu, 4 Dec 2025 13:35:28 -0500
Subject: [PATCH 5/6] Remove declarations from tests

---
 .../test/Transforms/InstCombine/AArch64/tbl.ll | 18 ------------------
 llvm/test/Transforms/InstCombine/ARM/tbl.ll    | 10 ----------
 2 files changed, 28 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
index cbc18d6414039..653a0eb26a68d 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
@@ -259,21 +259,3 @@ define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
   ret <16 x i8> %tbl
 }
-
-; "Real" declarations
-declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
index a6d304c8735eb..d4d5ec284d0b7 100644
--- a/llvm/test/Transforms/InstCombine/ARM/tbl.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
@@ -213,13 +213,3 @@ define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) {
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison)
   ret <8 x i8> %tbl
 }
-
-; Declarations
-declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone

From 9bafba0282f4830b65fb1e09565618c76c4443f1 Mon Sep 17 00:00:00 2001
From: valadaptive
Date: Thu, 4 Dec 2025 13:40:57 -0500
Subject: [PATCH 6/6] Remove more declarations

---
 llvm/test/Transforms/InstCombine/AArch64/tbl.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
index 653a0eb26a68d..8a9ca6ce635a3 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
@@ -213,10 +213,10 @@ define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
 ; CHECK-NEXT: ret <8 x i16> [[TBL1]]
 ;
 entry:
+  ; `tbl1.v8i16` is not really a thing, but it's good to check.
   %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> )
   ret <8 x i16> %tbl1
 }
-declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
 
 ; tbl1 with non-8/16 element count should NOT optimize.
 define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
@@ -226,10 +226,10 @@ define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
 ; CHECK-NEXT: ret <12 x i8> [[TBL1]]
 ;
 entry:
+  ; `tbl1.v12i8` is not really a thing, but it's good to check.
   %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> )
   ret <12 x i8> %tbl1
 }
-declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>)
 
 ; Non-constant mask should NOT optimize.
 define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {