Skip to content

Commit

Permalink
Lowering x86 adds/addus/subs/subus intrinsics (clang)
Browse files Browse the repository at this point in the history
This is the patch that lowers x86 intrinsics to native IR
in order to enable optimizations.

Patch by tkrupa

Differential Revision: https://reviews.llvm.org/D44786

llvm-svn: 330323
  • Loading branch information
Alexander Ivchenko committed Apr 19, 2018
1 parent e8fed15 commit d96ddcc
Show file tree
Hide file tree
Showing 5 changed files with 656 additions and 90 deletions.
99 changes: 98 additions & 1 deletion clang/lib/CodeGen/CGBuiltin.cpp
Expand Up @@ -8449,6 +8449,76 @@ static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
}

// Emit addition or subtraction with saturation.
// Handles both signed and unsigned intrinsics.
static Value *EmitX86AddSubSatExpr(CodeGenFunction &CGF, const CallExpr *E,
                                   SmallVectorImpl<Value *> &Ops,
                                   bool IsAddition, bool Signed) {

  // Result vector type of the builtin and the number of lanes it holds.
  llvm::Type *ResultType = CGF.ConvertType(E->getType());
  int NumElements = ResultType->getVectorNumElements();
  Value *Result;
  if (!IsAddition && !Signed) {
    // Unsigned saturating subtraction needs no widening:
    // max(a, b) - b yields a - b when a > b and 0 otherwise.
    Value *AboveB = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]);
    Value *Larger = CGF.Builder.CreateSelect(AboveB, Ops[0], Ops[1]);
    Result = CGF.Builder.CreateSub(Larger, Ops[1]);
  } else {
    // Widen each lane to the next power-of-two width so the raw add/sub
    // cannot wrap; i8 lanes widen to i16, i16 lanes widen to i32.
    unsigned EltSizeInBits = ResultType->getScalarSizeInBits();
    llvm::Type *WideEltTy;
    if (EltSizeInBits == 8)
      WideEltTy = CGF.Builder.getInt16Ty();
    else
      WideEltTy = CGF.Builder.getInt32Ty();
    llvm::Type *WideVecTy = llvm::VectorType::get(WideEltTy, NumElements);

    // Extend both operands into the wider vector type.
    Value *LHSExt;
    Value *RHSExt;
    if (Signed) {
      LHSExt = CGF.Builder.CreateSExt(Ops[0], WideVecTy);
      RHSExt = CGF.Builder.CreateSExt(Ops[1], WideVecTy);
    } else {
      LHSExt = CGF.Builder.CreateZExt(Ops[0], WideVecTy);
      RHSExt = CGF.Builder.CreateZExt(Ops[1], WideVecTy);
    }

    llvm::Value *WideResult;
    if (IsAddition)
      WideResult = CGF.Builder.CreateAdd(LHSExt, RHSExt);
    else
      WideResult = CGF.Builder.CreateSub(LHSExt, RHSExt);

    // Build a splat of the narrow type's largest representable value
    // (all-ones for unsigned) and widen it to match the intermediate result.
    APInt SignedMaxValue =
        llvm::APInt::getSignedMaxValue(EltSizeInBits);
    Value *NarrowMax;
    Value *WideMax;
    if (Signed) {
      NarrowMax = llvm::ConstantInt::get(ResultType, SignedMaxValue);
      WideMax = CGF.Builder.CreateSExt(NarrowMax, WideVecTy);
    } else {
      NarrowMax = llvm::Constant::getAllOnesValue(ResultType);
      WideMax = CGF.Builder.CreateZExt(NarrowMax, WideVecTy);
    }

    // Clamp every lane that exceeded the narrow maximum down to that maximum.
    ICmpInst::Predicate Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
    Value *NoOverflow = CGF.Builder.CreateICmp(Pred, WideResult,
                                               WideMax); // 1 if no overflow.
    Value *Clamped = CGF.Builder.CreateSelect(
        NoOverflow, WideResult, WideMax); // Overflowed lanes take the max.

    if (Signed) {
      // Signed lanes can also underflow; clamp those up to the narrow minimum.
      APInt SignedMinValue =
          llvm::APInt::getSignedMinValue(EltSizeInBits);
      Value *NarrowMin = llvm::ConstantInt::get(ResultType, SignedMinValue);
      Value *WideMin = CGF.Builder.CreateSExt(NarrowMin, WideVecTy);
      Value *Underflow =
          CGF.Builder.CreateICmp(ICmpInst::ICMP_SLT, Clamped, WideMin);
      Clamped =
          CGF.Builder.CreateSelect(Underflow, WideMin, Clamped);
    }

    // Narrow the saturated value back to the expected result type.
    Result = CGF.Builder.CreateTrunc(Clamped, ResultType);
  }
  // Masked variants carry a passthrough vector and a mask as extra arguments.
  if (E->getNumArgs() == 4) {
    Value *PassThru = Ops[2];
    Value *Mask = Ops[3];
    return EmitX86Select(CGF, Mask, Result, PassThru);
  }

  return Result;
}

Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
Expand Down Expand Up @@ -9516,10 +9586,37 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Load->setVolatile(true);
return Load;
}
case X86::BI__builtin_ia32_paddusb512_mask:
case X86::BI__builtin_ia32_paddusw512_mask:
case X86::BI__builtin_ia32_paddusb256:
case X86::BI__builtin_ia32_paddusw256:
case X86::BI__builtin_ia32_paddusb128:
case X86::BI__builtin_ia32_paddusw128:
return EmitX86AddSubSatExpr(*this, E, Ops, true, false); // Add, unsigned.
case X86::BI__builtin_ia32_paddsb512_mask:
case X86::BI__builtin_ia32_paddsw512_mask:
case X86::BI__builtin_ia32_paddsb256:
case X86::BI__builtin_ia32_paddsw256:
case X86::BI__builtin_ia32_paddsb128:
case X86::BI__builtin_ia32_paddsw128:
return EmitX86AddSubSatExpr(*this, E, Ops, true, true); // Add, signed.
case X86::BI__builtin_ia32_psubusb512_mask:
case X86::BI__builtin_ia32_psubusw512_mask:
case X86::BI__builtin_ia32_psubusb256:
case X86::BI__builtin_ia32_psubusw256:
case X86::BI__builtin_ia32_psubusb128:
case X86::BI__builtin_ia32_psubusw128:
return EmitX86AddSubSatExpr(*this, E, Ops, false, false); // Sub, unsigned.
case X86::BI__builtin_ia32_psubsb512_mask:
case X86::BI__builtin_ia32_psubsw512_mask:
case X86::BI__builtin_ia32_psubsb256:
case X86::BI__builtin_ia32_psubsw256:
case X86::BI__builtin_ia32_psubsb128:
case X86::BI__builtin_ia32_psubsw128:
return EmitX86AddSubSatExpr(*this, E, Ops, false, true); // Sub, signed.
}
}


Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
SmallVector<Value*, 4> Ops;
Expand Down
66 changes: 58 additions & 8 deletions clang/test/CodeGen/avx2-builtins.c
Expand Up @@ -56,25 +56,53 @@ __m256i test_mm256_add_epi64(__m256i a, __m256i b) {

// Signed saturating byte add: verifies lowering to generic widen/add/clamp IR.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_adds_epi8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_adds_epi8
// CHECK: call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK-NOT: call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: add <32 x i16> %{{.*}}, %{{.*}}
// CHECK: icmp sle <32 x i16> %{{.*}}, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
// CHECK: icmp slt <32 x i16> %{{.*}}, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>, <32 x i16> %{{.*}}
// CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
return _mm256_adds_epi8(a, b);
}

// Signed saturating word add: verifies lowering to generic widen/add/clamp IR.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_adds_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_adds_epi16
// CHECK: call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK-NOT: call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: add <16 x i32> %{{.*}}, %{{.*}}
// CHECK: icmp sle <16 x i32> %{{.*}}, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
// CHECK: icmp slt <16 x i32> %{{.*}}, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>, <16 x i32> %{{.*}}
// CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
return _mm256_adds_epi16(a, b);
}

// Unsigned saturating byte add: zext-widen, add, clamp to 255, truncate.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_adds_epu8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_adds_epu8
// CHECK: call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK-NOT: call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: add <32 x i16> %{{.*}}, %{{.*}}
// CHECK: icmp ule <32 x i16> %{{.*}}, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
// CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
return _mm256_adds_epu8(a, b);
}

// Unsigned saturating word add: zext-widen, add, clamp to 65535, truncate.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_adds_epu16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_adds_epu16
// CHECK: call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK-NOT: call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: add <16 x i32> %{{.*}}, %{{.*}}
// CHECK: icmp ule <16 x i32> %{{.*}}, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
// CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
return _mm256_adds_epu16(a, b);
}

Expand Down Expand Up @@ -1171,25 +1199,47 @@ __m256i test_mm256_sub_epi64(__m256i a, __m256i b) {

// Signed saturating byte sub: verifies lowering to generic widen/sub/clamp IR.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_subs_epi8
// CHECK: call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK-NOT: call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: sext <32 x i8> %{{.*}} to <32 x i16>
// CHECK: sub <32 x i16> %{{.*}}, %{{.*}}
// CHECK: icmp sle <32 x i16> %{{.*}}, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
// CHECK: icmp slt <32 x i16> %{{.*}}, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>, <32 x i16> %{{.*}}
// CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
return _mm256_subs_epi8(a, b);
}

// Signed saturating word sub: verifies lowering to generic widen/sub/clamp IR.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_subs_epi16
// CHECK: call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK-NOT: call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
// CHECK: sub <16 x i32> %{{.*}}, %{{.*}}
// CHECK: icmp sle <16 x i32> %{{.*}}, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
// CHECK: icmp slt <16 x i32> %{{.*}}, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>, <16 x i32> %{{.*}}
// CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
return _mm256_subs_epi16(a, b);
}

// Unsigned saturating byte sub: lowered without widening as max(a,b) - b.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_subs_epu8
// CHECK: call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK-NOT: call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
// CHECK: icmp ugt <32 x i8> {{.*}}, {{.*}}
// CHECK: select <32 x i1> {{.*}}, <32 x i8> {{.*}}, <32 x i8> {{.*}}
// CHECK: sub <32 x i8> {{.*}}, {{.*}}
return _mm256_subs_epu8(a, b);
}

// Unsigned saturating word sub: lowered without widening as max(a,b) - b.
// NOTE(review): the first directive duplicates the CHECK-NOT pattern — likely
// the removed side of the diff whose '-' marker was lost; confirm upstream.
__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_subs_epu16
// CHECK: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK-NOT: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
// CHECK: icmp ugt <16 x i16> {{.*}}, {{.*}}
// CHECK: select <16 x i1> {{.*}}, <16 x i16> {{.*}}, <16 x i16> {{.*}}
// CHECK: sub <16 x i16> {{.*}}, {{.*}}
return _mm256_subs_epu16(a, b);
}

Expand Down

0 comments on commit d96ddcc

Please sign in to comment.