diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index cf2b32c74eb5a5..355f5a217a9af6 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2252,19 +2252,19 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
           DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2)
         GV->setAlignment(PrefAlign);
     }
-    // If this is a memcpy (or similar) then we may be able to improve the
-    // alignment
-    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
-      Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
-      MaybeAlign MIDestAlign = MI->getDestAlign();
-      if (!MIDestAlign || DestAlign > *MIDestAlign)
-        MI->setDestAlignment(DestAlign);
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
-        MaybeAlign MTISrcAlign = MTI->getSourceAlign();
-        Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
-        if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
-          MTI->setSourceAlignment(SrcAlign);
-      }
+  }
+  // If this is a memcpy (or similar) then we may be able to improve the
+  // alignment.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+    Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+    MaybeAlign MIDestAlign = MI->getDestAlign();
+    if (!MIDestAlign || DestAlign > *MIDestAlign)
+      MI->setDestAlignment(DestAlign);
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+      MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+      Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+      if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
+        MTI->setSourceAlignment(SrcAlign);
     }
   }
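The functional change: the mem-intrinsic alignment improvement moves out of the `TLI->shouldAlignPointerArgs()` block, so it now runs for every call on every target rather than only where that hook returns true. A minimal IR sketch of the effect; the function below is hypothetical, not a test from this patch:

    ; With the hoisted code, CodeGenPrepare can raise both alignments on
    ; this memcpy from 1 to 16 on any target, because getKnownAlignment()
    ; sees through to the allocas.
    define void @align_sketch() {
      %src = alloca [32 x i8], align 16
      %dst = alloca [32 x i8], align 16
      call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 32, i1 false)
      ret void
    }
    declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

The AMDGPU and X86 test updates below follow from that: better-known alignments let the backends merge or widen the loads and stores emitted for mem intrinsics.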
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 77c2205018f95e..ee1bc4376a1c99 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -19,16 +19,16 @@ define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    ds_read_u8 v1, v0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 2
+; GCN-NEXT:    ds_write_b8 v0, v1
 ; GCN-NEXT:    ds_read_u8 v2, v0 offset:2
+; GCN-NEXT:    ds_read_u16 v3, v0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    ds_write_b8 v0, v3
-; GCN-NEXT:    ds_write_b8 v0, v3 offset:4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    ds_write_b8 v0, v1 offset:5
 ; GCN-NEXT:    ds_write_b8 v0, v2 offset:6
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    ds_write_b16 v0, v3 offset:4
+; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
index fbec4dc813c2fd..53c228943d9148 100644
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -64,13 +64,14 @@ entry:
 define void @ret_large_struct(ptr noalias nocapture sret(%struct.st12_t) %agg.result, ptr byval(%struct.st12_t) nocapture readonly align 4 %r) #0 {
 ; CHECK-LABEL: ret_large_struct:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl $48, %ecx
-; CHECK-NEXT:    calll memcpy
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $12, %ecx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
 ; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
   call void @llvm.memcpy.p0.p0.i32(ptr %agg.result, ptr %r, i32 48, i1 false)
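The mcu-abi.ll change shows the knock-on effect on X86: with both operands of the 48-byte memcpy now known to be 4-byte aligned (provable from the `byval ... align 4` argument and the `sret` pointer), the backend can expand the copy inline as twelve dword moves (`movl $12, %ecx` + `rep;movsl`) instead of emitting `calll memcpy`. A sketch of the post-CodeGenPrepare IR under that assumption; the struct layout and function name here are hypothetical:

    %struct.st12_t = type { [12 x i32] }    ; assumed 48-byte layout

    ; Both pointers carry a known 'align 4', so SelectionDAG can lower
    ; the 48-byte copy as 'rep;movsl' over 12 dwords rather than a libcall.
    define void @ret_large_struct_sketch(ptr noalias sret(%struct.st12_t) %agg.result,
                                         ptr byval(%struct.st12_t) align 4 %r) {
      call void @llvm.memcpy.p0.p0.i32(ptr align 4 %agg.result, ptr align 4 %r, i32 48, i1 false)
      ret void
    }
    declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)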
diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
index 72ef36fc82733c..a4fbbc57004708 100644
--- a/llvm/test/CodeGen/X86/memset-2.ll
+++ b/llvm/test/CodeGen/X86/memset-2.ll
@@ -1,31 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
 
-define fastcc void @t1() nounwind {
+define fastcc void @t1(ptr nocapture %s) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    subl $16, %esp
 ; CHECK-NEXT:    pushl $188
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr null, i8 0, i32 188, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 0, i32 188, i1 false)
   unreachable
 }
 
-define fastcc void @t2(i8 signext %c) nounwind {
+define fastcc void @t2(ptr nocapture %s, i8 signext %c) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    subl $12, %esp
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    pushl $76
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
+; CHECK-NEXT:    addl $16, %esp
 ; CHECK-NEXT:    ud2
 entry:
-  call void @llvm.memset.p0.i32(ptr undef, i8 %c, i32 76, i1 false)
+  call void @llvm.memset.p0.i32(ptr %s, i8 %c, i32 76, i1 false)
   unreachable
 }
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
index 7b939dfb425967..c6eecdcdf99cc5 100644
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -3,55 +3,57 @@
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
 
-define void @bork() nounwind {
+define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
-; FAST-NEXT:    movups %xmm0, 64
-; FAST-NEXT:    movups %xmm0, 48
-; FAST-NEXT:    movups %xmm0, 32
-; FAST-NEXT:    movups %xmm0, 16
-; FAST-NEXT:    movups %xmm0, 0
+; FAST-NEXT:    movups %xmm0, 64(%eax)
+; FAST-NEXT:    movups %xmm0, 48(%eax)
+; FAST-NEXT:    movups %xmm0, 32(%eax)
+; FAST-NEXT:    movups %xmm0, 16(%eax)
+; FAST-NEXT:    movups %xmm0, (%eax)
 ; FAST-NEXT:    retl
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl $0, 4
-; SLOW_32-NEXT:    movl $0, 0
-; SLOW_32-NEXT:    movl $0, 12
-; SLOW_32-NEXT:    movl $0, 8
-; SLOW_32-NEXT:    movl $0, 20
-; SLOW_32-NEXT:    movl $0, 16
-; SLOW_32-NEXT:    movl $0, 28
-; SLOW_32-NEXT:    movl $0, 24
-; SLOW_32-NEXT:    movl $0, 36
-; SLOW_32-NEXT:    movl $0, 32
-; SLOW_32-NEXT:    movl $0, 44
-; SLOW_32-NEXT:    movl $0, 40
-; SLOW_32-NEXT:    movl $0, 52
-; SLOW_32-NEXT:    movl $0, 48
-; SLOW_32-NEXT:    movl $0, 60
-; SLOW_32-NEXT:    movl $0, 56
-; SLOW_32-NEXT:    movl $0, 68
-; SLOW_32-NEXT:    movl $0, 64
-; SLOW_32-NEXT:    movl $0, 76
-; SLOW_32-NEXT:    movl $0, 72
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SLOW_32-NEXT:    movl $0, 4(%eax)
+; SLOW_32-NEXT:    movl $0, (%eax)
+; SLOW_32-NEXT:    movl $0, 12(%eax)
+; SLOW_32-NEXT:    movl $0, 8(%eax)
+; SLOW_32-NEXT:    movl $0, 20(%eax)
+; SLOW_32-NEXT:    movl $0, 16(%eax)
+; SLOW_32-NEXT:    movl $0, 28(%eax)
+; SLOW_32-NEXT:    movl $0, 24(%eax)
+; SLOW_32-NEXT:    movl $0, 36(%eax)
+; SLOW_32-NEXT:    movl $0, 32(%eax)
+; SLOW_32-NEXT:    movl $0, 44(%eax)
+; SLOW_32-NEXT:    movl $0, 40(%eax)
+; SLOW_32-NEXT:    movl $0, 52(%eax)
+; SLOW_32-NEXT:    movl $0, 48(%eax)
+; SLOW_32-NEXT:    movl $0, 60(%eax)
+; SLOW_32-NEXT:    movl $0, 56(%eax)
+; SLOW_32-NEXT:    movl $0, 68(%eax)
+; SLOW_32-NEXT:    movl $0, 64(%eax)
+; SLOW_32-NEXT:    movl $0, 76(%eax)
+; SLOW_32-NEXT:    movl $0, 72(%eax)
 ; SLOW_32-NEXT:    retl
 ;
 ; SLOW_64-LABEL: bork:
 ; SLOW_64:       # %bb.0:
-; SLOW_64-NEXT:    movq $0, 72
-; SLOW_64-NEXT:    movq $0, 64
-; SLOW_64-NEXT:    movq $0, 56
-; SLOW_64-NEXT:    movq $0, 48
-; SLOW_64-NEXT:    movq $0, 40
-; SLOW_64-NEXT:    movq $0, 32
-; SLOW_64-NEXT:    movq $0, 24
-; SLOW_64-NEXT:    movq $0, 16
-; SLOW_64-NEXT:    movq $0, 8
-; SLOW_64-NEXT:    movq $0, 0
+; SLOW_64-NEXT:    movq $0, 72(%rdi)
+; SLOW_64-NEXT:    movq $0, 64(%rdi)
+; SLOW_64-NEXT:    movq $0, 56(%rdi)
+; SLOW_64-NEXT:    movq $0, 48(%rdi)
+; SLOW_64-NEXT:    movq $0, 40(%rdi)
+; SLOW_64-NEXT:    movq $0, 32(%rdi)
+; SLOW_64-NEXT:    movq $0, 24(%rdi)
+; SLOW_64-NEXT:    movq $0, 16(%rdi)
+; SLOW_64-NEXT:    movq $0, 8(%rdi)
+; SLOW_64-NEXT:    movq $0, (%rdi)
 ; SLOW_64-NEXT:    retq
-  call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
   ret void
 }
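The memset-2.ll and memset64-on-x86-32.ll updates change the tests' inputs, not just their CHECK lines: now that the alignment improvement runs unconditionally, getKnownAlignment() can prove a very large alignment for a literal `null` or `undef` destination, so those calls would no longer exercise the modest alignment the tests were written for. Passing a real `ptr` parameter pins the known alignment down. A hypothetical contrast (function names invented here; the alignment claim for `null` assumes the default address space):

    define void @null_dest_sketch() nounwind {
      ; After this patch, the null destination gets a (maximal) provable
      ; alignment, so this call no longer tests 'align 4' lowering.
      call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 80, i1 false)
      ret void
    }
    define void @param_dest_sketch(ptr align 4 %dst) nounwind {
      ; A plain parameter keeps the provable alignment at 4, preserving
      ; the original intent of @bork.
      call void @llvm.memset.p0.i64(ptr align 4 %dst, i8 0, i64 80, i1 false)
      ret void
    }
    declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)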