57 changes: 57 additions & 0 deletions llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -0,0 +1,57 @@
//===- ARMCommonInstCombineIntrinsic.h - ARM/AArch64 intrinsic opts -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file declares optimizations for ARM and AArch64 intrinsics that are
/// shared by both architectures. These functions can be called from:
/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
/// intrinsics)
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
#define LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

namespace llvm {

namespace ARMCommon {

/// Convert a table lookup to a shufflevector if the mask is constant.
/// This benefits tbl1 in particular when the mask is { 7,6,5,4,3,2,1,0 }:
/// that shufflevector is actually a byte reverse and can be lowered to a
/// rev64 instruction.
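///
/// As an illustrative sketch (IR paraphrased, not taken from this patch's
/// tests), a constant-mask lookup such as
///   %r = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %v,
///            <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
/// becomes
///   %r = shufflevector <8 x i8> %v, <8 x i8> zeroinitializer,
///            <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>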
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);

/// Simplify NEON multiply-long intrinsics (smull, umull).
/// These intrinsics perform widening multiplies: they multiply two vectors of
/// narrow integers and produce a vector of wider integers. This function
/// performs algebraic simplifications:
/// 1. Multiply by zero => zero vector
/// 2. Multiply by one => zero/sign-extend the non-one operand
/// 3. Both operands constant => regular multiply that can be constant-folded
/// later
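///
/// For instance (a sketch of case 2, assuming a splat-of-one operand),
///   %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x,
///            <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
/// simplifies to
///   %r = sext <4 x i16> %x to <4 x i32>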
Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
bool IsSigned);

/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
///
/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
/// separate arguments, before performing the encryption/decryption operation.
/// We can fold that "internal" XOR with a previous one.
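///
/// For example (a sketch), with a zero key the preceding XOR folds into the
/// intrinsic's implicit one:
///   %t = xor <16 x i8> %data, %key
///   %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %t,
///            <16 x i8> zeroinitializer)
/// becomes
///   %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data,
///            <16 x i8> %key)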
Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);

} // namespace ARMCommon
} // namespace llvm

#endif // LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
13 changes: 13 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
@@ -2856,6 +2857,18 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
case Intrinsic::aarch64_neon_tbl1:
return ARMCommon::simplifyNeonTbl1(II, IC);
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull: {
bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
}
case Intrinsic::aarch64_crypto_aesd:
case Intrinsic::aarch64_crypto_aese:
case Intrinsic::aarch64_sve_aesd:
case Intrinsic::aarch64_sve_aese:
return ARMCommon::simplifyAES(II, IC);
case Intrinsic::aarch64_sve_convert_from_svbool:
return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
14 changes: 14 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
@@ -182,6 +183,19 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}

case Intrinsic::arm_neon_vtbl1:
return ARMCommon::simplifyNeonTbl1(II, IC);

case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu: {
bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
}

case Intrinsic::arm_neon_aesd:
case Intrinsic::arm_neon_aese:
return ARMCommon::simplifyAES(II, IC);

case Intrinsic::arm_mve_pred_i2v: {
Value *Arg = II.getArgOperand(0);
Value *ArgArg;
104 changes: 0 additions & 104 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
return nullptr;
}

/// Convert a table lookup to shufflevector if the mask is constant.
/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
/// which case we could lower the shufflevector with rev64 instructions
/// as it's actually a byte reverse.
static Value *simplifyNeonTbl1(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
// Bail out if the mask is not a constant.
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
if (!C)
return nullptr;

auto *VecTy = cast<FixedVectorType>(II.getType());
unsigned NumElts = VecTy->getNumElements();

// Only perform this transformation for <8 x i8> vector types.
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
return nullptr;

int Indexes[8];

for (unsigned I = 0; I < NumElts; ++I) {
Constant *COp = C->getAggregateElement(I);

if (!COp || !isa<ConstantInt>(COp))
return nullptr;

Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

// Make sure the mask indices are in range.
if ((unsigned)Indexes[I] >= NumElts)
return nullptr;
}

auto *V1 = II.getArgOperand(0);
auto *V2 = Constant::getNullValue(V1->getType());
return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
}

// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3155,72 +3117,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
return CallInst::Create(NewFn, CallArgs);
}
case Intrinsic::arm_neon_vtbl1:
case Intrinsic::aarch64_neon_tbl1:
if (Value *V = simplifyNeonTbl1(*II, Builder))
return replaceInstUsesWith(*II, V);
break;

case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu:
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull: {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);

// Handle mul by zero first:
if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
}

// Check for constant LHS & RHS - in this case we just simplify.
bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
IID == Intrinsic::aarch64_neon_umull);
VectorType *NewVT = cast<VectorType>(II->getType());
if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
}

// Couldn't simplify - canonicalize constant to the RHS.
std::swap(Arg0, Arg1);
}

// Handle mul by one:
if (Constant *CV1 = dyn_cast<Constant>(Arg1))
if (ConstantInt *Splat =
dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
if (Splat->isOne())
return CastInst::CreateIntegerCast(Arg0, II->getType(),
/*isSigned=*/!Zext);

break;
}
case Intrinsic::arm_neon_aesd:
case Intrinsic::arm_neon_aese:
case Intrinsic::aarch64_crypto_aesd:
case Intrinsic::aarch64_crypto_aese:
case Intrinsic::aarch64_sve_aesd:
case Intrinsic::aarch64_sve_aese: {
Value *DataArg = II->getArgOperand(0);
Value *KeyArg = II->getArgOperand(1);

// Accept zero on either operand.
if (!match(KeyArg, m_ZeroInt()))
std::swap(KeyArg, DataArg);

// Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
Value *Data, *Key;
if (match(KeyArg, m_ZeroInt()) &&
match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
replaceOperand(*II, 0, Data);
replaceOperand(*II, 1, Key);
return II;
}
break;
}
case Intrinsic::hexagon_V6_vandvrt:
case Intrinsic::hexagon_V6_vandvrt_128B: {
// Simplify Q -> V -> Q conversion.
136 changes: 136 additions & 0 deletions llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -0,0 +1,136 @@
//===- ARMCommonInstCombineIntrinsic.cpp - ARM/AArch64 intrinsic opts ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements optimizations for ARM and AArch64 intrinsics that
/// are shared by both architectures. These functions can be called from:
/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
/// intrinsics)
///
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace llvm {
namespace ARMCommon {

/// Convert a table lookup to a shufflevector if the mask is constant.
/// This benefits tbl1 in particular when the mask is { 7,6,5,4,3,2,1,0 }:
/// that shufflevector is actually a byte reverse and can be lowered to a
/// rev64 instruction.
Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
// Bail out if the mask is not a constant.
auto *C = dyn_cast<Constant>(II.getArgOperand(1));
if (!C)
return nullptr;

auto *VecTy = cast<FixedVectorType>(II.getType());
unsigned NumElts = VecTy->getNumElements();

// Only perform this transformation for <8 x i8> vector types.
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
return nullptr;

int Indexes[8];

for (unsigned I = 0; I < NumElts; ++I) {
Constant *COp = C->getAggregateElement(I);

if (!COp || !isa<ConstantInt>(COp))
return nullptr;

Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();

// Make sure the mask indices are in range.
if ((unsigned)Indexes[I] >= NumElts)
return nullptr;
}

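// Every mask index selects from the first vector, so the second shuffle
// operand is only a placeholder; a zero vector of the matching type suffices.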
auto *V1 = II.getArgOperand(0);
auto *V2 = Constant::getNullValue(V1->getType());
Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
return IC.replaceInstUsesWith(II, Shuf);
}

/// Simplify NEON multiply-long intrinsics (smull, umull).
/// These intrinsics perform widening multiplies: they multiply two vectors of
/// narrow integers and produce a vector of wider integers. This function
/// performs algebraic simplifications:
/// 1. Multiply by zero => zero vector
/// 2. Multiply by one => zero/sign-extend the non-one operand
/// 3. Both operands constant => regular multiply that can be constant-folded
/// later
Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
bool IsSigned) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);

// Handle mul by zero first:
if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
}

// Check for constant LHS & RHS - in this case we just simplify.
VectorType *NewVT = cast<VectorType>(II.getType());
if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
}

// Couldn't simplify - canonicalize constant to the RHS.
std::swap(Arg0, Arg1);
}

// Handle mul by one:
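// A splat of one reduces the widening multiply to a plain sign/zero
// extension of the other operand. The cast built here is a fresh,
// not-yet-inserted instruction; the caller inserts it in place of the
// intrinsic call.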
if (Constant *CV1 = dyn_cast<Constant>(Arg1))
if (ConstantInt *Splat =
dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
if (Splat->isOne())
return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);

return nullptr;
}

/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
///
/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
/// separate arguments, before performing the encryption/decryption operation.
/// We can fold that "internal" XOR with a previous one.
Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
Value *DataArg = II.getArgOperand(0);
Value *KeyArg = II.getArgOperand(1);

// Accept zero on either operand.
if (!match(KeyArg, m_ZeroInt()))
std::swap(KeyArg, DataArg);
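// After the conditional swap, any zero operand is in KeyArg; if neither
// operand is zero, the match below fails and we bail out with no change.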

// Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR.
Value *Data, *Key;
if (match(KeyArg, m_ZeroInt()) &&
match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
IC.replaceOperand(II, 0, Data);
IC.replaceOperand(II, 1, Key);
return &II;
}

return nullptr;
}

} // namespace ARMCommon
} // namespace llvm
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_component_library(LLVMTransformUtils
AddDiscriminators.cpp
AMDGPUEmitPrintf.cpp
ARMCommonInstCombineIntrinsic.cpp
ASanStackFrameLayout.cpp
AssumeBundleBuilder.cpp
BasicBlockUtils.cpp
2 changes: 1 addition & 1 deletion llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s
; ARM64 AES intrinsic variants

define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s

define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
; CHECK-LABEL: define <4 x i32> @mulByZero(
2 changes: 1 addition & 1 deletion llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s
; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
; ARM AES intrinsic variants

define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {