[CGP] Add generic TargetLowering::shouldAlignPointerArgs() implementation

This function was added for ARM targets, but aligning global/stack pointer
arguments passed to memcpy/memmove/memset can improve code size and
performance for all targets that don't have fast unaligned accesses.
This adds a generic implementation that adjusts the alignment to pointer
size if unaligned accesses are slow.
Review D134168 suggests that this significantly improves performance on
synthetic benchmarks such as Dhrystone on RV32 as it avoids memcpy() calls.
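
To illustrate the effect (mirroring the new CodeGenPrepare test added at the end of this diff): on riscv32, where unaligned scalar accesses are not fast, CodeGenPrepare now rewrites

  %dst = alloca [45 x i8], align 1
  tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 dereferenceable(31) @str, i32 31, i1 false)

into

  %dst = alloca [45 x i8], align 4
  tail call void @llvm.memcpy.p0.p0.i32(ptr align 4 %dst, ptr align 4 dereferenceable(31) @str, i32 31, i1 false)

and raises the alignment of the @str global itself to 4 (8 on riscv64). With aligned arguments the backend can expand small copies using word-sized loads and stores instead of a memcpy() libcall, as the memcpy-inline.ll changes below show.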

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D134282
arichardson committed Feb 9, 2023
1 parent f28c28e commit bd87a24
Showing 10 changed files with 138 additions and 59 deletions.
8 changes: 4 additions & 4 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1932,10 +1932,10 @@ class TargetLoweringBase {
  /// the object whose address is being passed. If so then MinSize is set to the
  /// minimum size the object must be to be aligned and PrefAlign is set to the
  /// preferred alignment.
  virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/,
                                      Align & /*PrefAlign*/) const {
    return false;
  }
  virtual bool
  shouldUpdatePointerArgAlignment(const CallInst *CI, unsigned &MinSize,
                                  Align &PrefAlign,
                                  const TargetTransformInfo &TTI) const;

  //===--------------------------------------------------------------------===//
  /// \name Helpers for TargetTransformInfo implementations
4 changes: 2 additions & 2 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2221,10 +2221,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
  }

  // Align the pointer arguments to this call if the target thinks it's a good
  // idea
  // idea (generally only useful for memcpy/memmove/memset).
  unsigned MinSize;
  Align PrefAlign;
  if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
  if (TLI->shouldUpdatePointerArgAlignment(CI, MinSize, PrefAlign, *TTI)) {
    for (auto &Arg : CI->args()) {
      // We want to align both objects whose address is used directly and
      // objects whose address is used in casts and GEPs, though it only makes
37 changes: 37 additions & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
@@ -948,6 +949,42 @@ bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS,
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
}

bool TargetLoweringBase::shouldUpdatePointerArgAlignment(
    const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
    const TargetTransformInfo &TTI) const {
  // For now, we only adjust alignment for memcpy/memmove/memset calls.
  auto *MemCI = dyn_cast<MemIntrinsic>(CI);
  if (!MemCI)
    return false;
  auto AddrSpace = MemCI->getDestAddressSpace();
  // We assume that scalar register sized values can be loaded/stored
  // efficiently. If this is not the case for a given target it should override
  // this function.
  auto PrefSizeBits =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedSize();
  PrefAlign = Align(PrefSizeBits / 8);
  // When building with -Oz, we only increase the alignment if the object is
  // at least 8 bytes in size to avoid increased stack/global padding.
  // Otherwise, we require at least PrefAlign bytes to be copied.
  MinSize = PrefAlign.value();
  if (CI->getFunction()->hasMinSize())
    MinSize = std::max(MinSize, 8u);

  // XXX: we could determine the MachineMemOperand flags instead of assuming
  // load+store (but it probably makes no difference for supported targets).
  unsigned FastUnalignedAccess = 0;
  if (allowsMisalignedMemoryAccesses(
          LLT::scalar(PrefSizeBits), AddrSpace, Align(1),
          MachineMemOperand::MOStore | MachineMemOperand::MOLoad,
          &FastUnalignedAccess) &&
      FastUnalignedAccess) {
    // If unaligned loads&stores are fast, there is no need to adjust
    // alignment.
    return false;
  }
  return true; // unaligned accesses are not possible or slow.
}

void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
  // If the command-line option was specified, ignore this request.
  if (!JumpIsExpensiveOverride.getNumOccurrences())
5 changes: 3 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1920,8 +1920,9 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
// source/dest is aligned and the copy size is large enough. We therefore want
// to align such objects passed to memory intrinsics.
bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                               Align &PrefAlign) const {
bool ARMTargetLowering::shouldUpdatePointerArgAlignment(
    const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
    const TargetTransformInfo &TTI) const {
  if (!isa<MemIntrinsic>(CI))
    return false;
  MinSize = 8;
5 changes: 3 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.h
@@ -572,8 +572,9 @@ class VectorType;
    const TargetRegisterClass *
    getRegClassFor(MVT VT, bool isDivergent = false) const override;

    bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                Align &PrefAlign) const override;
    bool shouldUpdatePointerArgAlignment(
        const CallInst *CI, unsigned &MinSize, Align &PrefAlign,
        const TargetTransformInfo &TTI) const override;

    /// createFastISel - This method returns a target specific FastISel object,
    /// or null if the target does not support "fast" ISel.
61 changes: 23 additions & 38 deletions llvm/test/CodeGen/RISCV/memcpy-inline.ll
@@ -295,50 +295,35 @@ entry:
}

define void @t6() nounwind {
; RV32ALIGNED-LABEL: t6:
; RV32ALIGNED: # %bb.0: # %entry
; RV32ALIGNED-NEXT: addi sp, sp, -16
; RV32ALIGNED-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32ALIGNED-NEXT: lui a0, %hi(spool.splbuf)
; RV32ALIGNED-NEXT: addi a0, a0, %lo(spool.splbuf)
; RV32ALIGNED-NEXT: lui a1, %hi(.L.str6)
; RV32ALIGNED-NEXT: addi a1, a1, %lo(.L.str6)
; RV32ALIGNED-NEXT: li a2, 14
; RV32ALIGNED-NEXT: call memcpy@plt
; RV32ALIGNED-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32ALIGNED-NEXT: addi sp, sp, 16
; RV32ALIGNED-NEXT: ret
; RV32-LABEL: t6:
; RV32: # %bb.0: # %entry
; RV32-NEXT: lui a0, %hi(spool.splbuf)
; RV32-NEXT: li a1, 88
; RV32-NEXT: sh a1, %lo(spool.splbuf+12)(a0)
; RV32-NEXT: lui a1, 361862
; RV32-NEXT: addi a1, a1, -1960
; RV32-NEXT: sw a1, %lo(spool.splbuf+8)(a0)
; RV32-NEXT: lui a1, 362199
; RV32-NEXT: addi a1, a1, 559
; RV32-NEXT: sw a1, %lo(spool.splbuf+4)(a0)
; RV32-NEXT: lui a1, 460503
; RV32-NEXT: addi a1, a1, 1071
; RV32-NEXT: sw a1, %lo(spool.splbuf)(a0)
; RV32-NEXT: ret
;
; RV64ALIGNED-LABEL: t6:
; RV64ALIGNED: # %bb.0: # %entry
; RV64ALIGNED-NEXT: addi sp, sp, -16
; RV64ALIGNED-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64ALIGNED-NEXT: lui a0, %hi(spool.splbuf)
; RV64ALIGNED-NEXT: addi a0, a0, %lo(spool.splbuf)
; RV64ALIGNED-NEXT: lui a1, %hi(.L.str6)
; RV64ALIGNED-NEXT: addi a1, a1, %lo(.L.str6)
; RV64ALIGNED-NEXT: li a2, 14
; RV64ALIGNED-NEXT: call memcpy@plt
; RV64ALIGNED-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64ALIGNED-NEXT: addi sp, sp, 16
; RV64ALIGNED-NEXT: li a1, 88
; RV64ALIGNED-NEXT: sh a1, %lo(spool.splbuf+12)(a0)
; RV64ALIGNED-NEXT: lui a1, %hi(.LCPI6_0)
; RV64ALIGNED-NEXT: ld a1, %lo(.LCPI6_0)(a1)
; RV64ALIGNED-NEXT: lui a2, 361862
; RV64ALIGNED-NEXT: addiw a2, a2, -1960
; RV64ALIGNED-NEXT: sw a2, %lo(spool.splbuf+8)(a0)
; RV64ALIGNED-NEXT: sd a1, %lo(spool.splbuf)(a0)
; RV64ALIGNED-NEXT: ret
;
; RV32UNALIGNED-LABEL: t6:
; RV32UNALIGNED: # %bb.0: # %entry
; RV32UNALIGNED-NEXT: lui a0, %hi(spool.splbuf)
; RV32UNALIGNED-NEXT: li a1, 88
; RV32UNALIGNED-NEXT: sh a1, %lo(spool.splbuf+12)(a0)
; RV32UNALIGNED-NEXT: lui a1, 361862
; RV32UNALIGNED-NEXT: addi a1, a1, -1960
; RV32UNALIGNED-NEXT: sw a1, %lo(spool.splbuf+8)(a0)
; RV32UNALIGNED-NEXT: lui a1, 362199
; RV32UNALIGNED-NEXT: addi a1, a1, 559
; RV32UNALIGNED-NEXT: sw a1, %lo(spool.splbuf+4)(a0)
; RV32UNALIGNED-NEXT: lui a1, 460503
; RV32UNALIGNED-NEXT: addi a1, a1, 1071
; RV32UNALIGNED-NEXT: sw a1, %lo(spool.splbuf)(a0)
; RV32UNALIGNED-NEXT: ret
;
; RV64UNALIGNED-LABEL: t6:
; RV64UNALIGNED: # %bb.0: # %entry
; RV64UNALIGNED-NEXT: lui a0, %hi(.L.str6)
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/WebAssembly/bulk-memory.ll
@@ -154,7 +154,7 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
@@ -171,7 +171,7 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
@@ -188,7 +188,7 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/WebAssembly/bulk-memory64.ll
@@ -157,7 +157,7 @@ define void @memset_1024(ptr %dest, i8 %val) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
@@ -174,7 +174,7 @@ define void @memcpy_alloca_src(ptr %dst) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
@@ -191,7 +191,7 @@ define void @memcpy_alloca_dst(ptr %src) {
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
; BULK-MEM-NEXT: i64.const $push[[L1:[0-9]+]]=, 112
; BULK-MEM-NEXT: i64.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 12
; BULK-MEM-NEXT: i64.const $push[[L3:[0-9]+]]=, 8
; BULK-MEM-NEXT: i64.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
; BULK-MEM-NEXT: i64.const $push[[L5:[0-9]+]]=, 100
; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
@@ -134,7 +134,7 @@ define i64 @test_return_i2(i64 %i.coerce) {
; ALL: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.retval
; ALL: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1.i
; ALL: G_STORE [[COPY]](s64), [[FRAME_INDEX1]](p0) :: (store (s64) into %ir.0, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 4), (load (s8) from %ir.2, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.1, align 8), (load (s8) from %ir.2, align 8)
; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.3, align 4)
; ALL: $rax = COPY [[LOAD]](s64)
; ALL: RET 0, implicit $rax
@@ -166,9 +166,9 @@ define { i64, i32 } @test_return_i3(i64 %i.coerce0, i32 %i.coerce1) {
; ALL: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; ALL: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX2]], [[C1]](s64)
; ALL: G_STORE [[COPY1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %ir.1)
; ALL: G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 4), (load (s8) from %ir.3, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 4), (load (s8) from %ir.5, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX1]](p0), [[FRAME_INDEX2]](p0), [[C]](s64), 0 :: (store (s8) into %ir.2, align 8), (load (s8) from %ir.3, align 8)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.4, align 8), (load (s8) from %ir.5, align 8)
; ALL: G_MEMCPY [[FRAME_INDEX3]](p0), [[FRAME_INDEX]](p0), [[C]](s64), 0 :: (store (s8) into %ir.6, align 8), (load (s8) from %ir.7, align 8)
; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX3]](p0) :: (dereferenceable load (s64) from %ir.tmp)
; ALL: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX3]], [[C1]](s64)
; ALL: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s32) from %ir.tmp + 8, align 8)
@@ -210,7 +210,7 @@ define { i64, i64 } @test_return_i4(i64 %i.coerce0, i64 %i.coerce1) {
; ALL: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; ALL: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX1]], [[C1]](s64)
; ALL: G_STORE [[COPY1]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.2, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 4), (load (s8) from %ir.4, align 4)
; ALL: G_MEMCPY [[FRAME_INDEX]](p0), [[FRAME_INDEX1]](p0), [[C]](s64), 0 :: (store (s8) into %ir.3, align 8), (load (s8) from %ir.4, align 8)
; ALL: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load (s64) from %ir.5, align 4)
; ALL: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s64)
; ALL: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s64) from %ir.5 + 8, align 4)
@@ -0,0 +1,55 @@
; RUN: opt -mtriple=riscv32 -data-layout="e-m:e-p:32:32" -S -codegenprepare < %s \
; RUN: | FileCheck %s '-D#NEW_ALIGNMENT=4'
; RUN: opt -mtriple=riscv64 -data-layout="e-m:e-p:64:64" -S -codegenprepare < %s \
; RUN: | FileCheck %s '-D#NEW_ALIGNMENT=8'

@str = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align 1


declare void @use(ptr %arg)


; CHECK: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [45 x i8] c"THIS IS A LONG STRING THAT SHOULD BE ALIGNED\00", align [[#NEW_ALIGNMENT]]

define void @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DST:%.*]] = alloca [45 x i8], align [[#NEW_ALIGNMENT]]
; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i32(ptr align [[#NEW_ALIGNMENT]] [[DST]], ptr align [[#NEW_ALIGNMENT]] dereferenceable(31) @str, i32 31, i1 false)
; CHECK-NEXT: ret void

entry:
  %dst = alloca [45 x i8], align 1
  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr align 1 dereferenceable(31) @str, i32 31, i1 false)
  ret void
}

; negative test - check that we don't align objects that are too small
define void @no_align(ptr %src) {
; CHECK-LABEL: @no_align(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DST:%.*]] = alloca [3 x i8], align 1
; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[SRC:%.*]], i32 31, i1 false)
; CHECK-NEXT: ret void
;
entry:
  %dst = alloca [3 x i8], align 1
  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr %src, i32 31, i1 false)
  ret void
}

; negative test - check that minsize requires at least 8 byte object size
define void @no_align_minsize(ptr %src) minsize {
; CHECK-LABEL: @no_align_minsize(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DST:%.*]] = alloca [7 x i8], align 1
; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[DST]], ptr align 1 [[SRC:%.*]], i32 31, i1 false)
; CHECK-NEXT: ret void
;
entry:
  %dst = alloca [7 x i8], align 1
  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %dst, ptr %src, i32 31, i1 false)
  ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1)
