612 changes: 611 additions & 1 deletion llvm/lib/CodeGen/CodeGenPrepare.cpp

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -842,9 +842,10 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
initActions();

// Perform these initializations only once.
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
= MaxStoresPerMemmoveOptSize = 4;
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
MaxLoadsPerMemcmp = 8;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;
UseUnderscoreLongJmp = false;
HasMultipleConditionRegisters = false;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
MaxLoadsPerMemcmp = 128;
} else {
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}

bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
MaxLoadSize = 8;
return true;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
/// @{

bool enableAggressiveInterleaving(bool LoopHasReductions);
bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
Expand Down
14 changes: 0 additions & 14 deletions llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,6 @@ static bool isCallingConvCCompatible(CallInst *CI) {
return false;
}

/// Return true if it only matters that the value is equal or not-equal to zero.
static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
for (User *U : V->users()) {
if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
if (IC->isEquality())
if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
if (C->isNullValue())
continue;
// Unknown instruction.
return false;
}
return true;
}

/// Return true if it is only used in equality comparisons with With.
static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
for (User *U : V->users()) {
Expand Down
121 changes: 121 additions & 0 deletions llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"

@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4

; Function Attrs: nounwind readonly
declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1

; Validate with if(memcmp())
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16)
%not.tobool = icmp ne i32 %call, 0
%. = zext i1 %not.tobool to i32
ret i32 %.

; CHECK-LABEL: @zeroEqualityTest01
; CHECK-LABEL: %res_block
; CHECK: li 3, 1
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
; CHECK: li 3, 0
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
}

; Validate with if(memcmp() == 0)
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
%not.cmp = icmp ne i32 %call, 0
%. = zext i1 %not.cmp to i32
ret i32 %.

; CHECK-LABEL: @zeroEqualityTest02
; CHECK-LABEL: %res_block
; CHECK: li 3, 1
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
; CHECK: li 3, 0
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
}

; Validate with > 0
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
%not.cmp = icmp slt i32 %call, 1
%. = zext i1 %not.cmp to i32
ret i32 %.

; CHECK-LABEL: @zeroEqualityTest03
; CHECK-LABEL: %res_block
; CHECK: cmpld
; CHECK-NEXT: li [[LI:[0-9]+]], 1
; CHECK-NEXT: li [[LI2:[0-9]+]], -1
; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
}

; Validate with < 0
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
%call.lobit = lshr i32 %call, 31
%call.lobit.not = xor i32 %call.lobit, 1
ret i32 %call.lobit.not

; CHECK-LABEL: @zeroEqualityTest04
; CHECK-LABEL: %res_block
; CHECK: cmpld
; CHECK-NEXT: li [[LI:[0-9]+]], 1
; CHECK-NEXT: li [[LI2:[0-9]+]], -1
; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
}

; Validate with memcmp()?:
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
%not.tobool = icmp eq i32 %call, 0
%cond = zext i1 %not.tobool to i32
ret i32 %cond

; CHECK-LABEL: @zeroEqualityTest05
; CHECK-LABEL: %res_block
; CHECK: li 3, 1
; CHECK: li 3, 0
}

; Validate with !memcmp()?:
; Function Attrs: nounwind readonly
define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 {
entry:
%call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
%not.lnot = icmp ne i32 %call, 0
%cond = zext i1 %not.lnot to i32
ret i32 %cond

; CHECK-LABEL: @zeroEqualityTest06
; CHECK-LABEL: %res_block
; CHECK: li 3, 1
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
; CHECK: li 3, 0
; CHECK-NEXT: clrldi
; CHECK-NEXT: blr
}
87 changes: 87 additions & 0 deletions llvm/test/CodeGen/PowerPC/memcmp.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s -check-prefix=CHECK

; Check size 8
; Function Attrs: nounwind readonly
define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2
ret i32 %call

; CHECK-LABEL: @test1
; CHECK: ldbrx [[LOAD1:[0-9]+]]
; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]]
; CHECK-NEXT: li [[LI:[0-9]+]], 1
; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
; CHECK-NEXT: li [[LI2:[0-9]+]], -1
; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
; CHECK-NEXT: extsw 3, [[ISEL2]]
; CHECK-NEXT: blr
}

; Check size 4
; Function Attrs: nounwind readonly
define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
ret i32 %call

; CHECK-LABEL: @test2
; CHECK: lwbrx [[LOAD1:[0-9]+]]
; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]]
; CHECK-NEXT: li [[LI:[0-9]+]], 1
; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
; CHECK-NEXT: li [[LI2:[0-9]+]], -1
; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
; CHECK-NEXT: extsw 3, [[ISEL2]]
; CHECK-NEXT: blr
}

; Check size 2
; Function Attrs: nounwind readonly
define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2
ret i32 %call

; CHECK-LABEL: @test3
; CHECK: lhbrx [[LOAD1:[0-9]+]]
; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]]
; CHECK-NEXT: li [[LI:[0-9]+]], 1
; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
; CHECK-NEXT: li [[LI2:[0-9]+]], -1
; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
; CHECK-NEXT: extsw 3, [[ISEL2]]
; CHECK-NEXT: blr
}

; Check size 1
; Function Attrs: nounwind readonly
define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2
ret i32 %call

; CHECK-LABEL: @test4
; CHECK: lbz [[LOAD1:[0-9]+]]
; CHECK-NEXT: lbz [[LOAD2:[0-9]+]]
; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
; CHECK-NEXT: extsw 3, [[SUB]]
; CHECK-NEXT: blr
}

; Function Attrs: nounwind readonly
declare signext i32 @memcmp(i8*, i8*, i64) #1
194 changes: 194 additions & 0 deletions llvm/test/CodeGen/PowerPC/memcmpIR.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE

define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
entry:
; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-LABEL: res_block:{{.*}}
; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-NEXT: br label %endblock

; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock


; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-BE-LABEL: res_block:{{.*}}
; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-BE-NEXT: br label %endblock

; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock

%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
ret i32 %call
}

declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1

define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock

; CHECK-LABEL: res_block:{{.*}}
; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-NEXT: br label %endblock

; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock

; CHECK-BE-LABEL: res_block:{{.*}}
; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-BE-NEXT: br label %endblock

entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
ret i32 %call
}

define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {
; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-LABEL: res_block:{{.*}}
; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-NEXT: br label %endblock

; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
; CHECK-NEXT: br label %endblock

; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-BE-LABEL: res_block:{{.*}}
; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
; CHECK-BE-NEXT: br label %endblock

; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label

; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
; CHECK-BE-NEXT: br label %endblock

entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
ret i32 %call
}
; CHECK: call = tail call signext i32 @memcmp
; CHECK-BE: call = tail call signext i32 @memcmp
define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) {

entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
ret i32 %call
}

define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) {
; CHECK: call = tail call signext i32 @memcmp
; CHECK-BE: call = tail call signext i32 @memcmp
entry:
%0 = bitcast i32* %buffer1 to i8*
%1 = bitcast i32* %buffer2 to i8*
%conv = sext i32 %SIZE to i64
%call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
ret i32 %call
}