From b051389f049bc195877077430a1300fd3fc75582 Mon Sep 17 00:00:00 2001
From: Derek Schuff <dschuff@google.com>
Date: Thu, 26 Mar 2015 22:11:00 +0000
Subject: [PATCH] Use movw/movt instead of constant pool loads to lower byval
 parameter copies

Summary:
The ARM backend can use a loop to implement copying byval parameters before
a call. In non-thumb2 mode it uses a constant pool load to materialize the
trip count. For targets that need movt instead (e.g. Native Client), use
the same code as in thumb2 mode to materialize the trip count.

Reviewers: jfb, t.p.northover

Differential Revision: http://reviews.llvm.org/D8442

llvm-svn: 233324
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 14 +++++++++-----
 llvm/test/CodeGen/ARM/struct_byval.ll   | 17 +++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 77a8a7c51b31f..3b1b8dd1b8ee4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -7105,16 +7105,20 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
 
   // Load an immediate to varEnd.
   unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (IsThumb2) {
+  if (Subtarget->useMovt(*MF)) {
     unsigned Vtmp = varEnd;
     if ((LoopSize & 0xFFFF0000) != 0)
       Vtmp = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
-                       .addImm(LoopSize & 0xFFFF));
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(IsThumb2 ? ARM::t2MOVi16 : ARM::MOVi16),
+                           Vtmp).addImm(LoopSize & 0xFFFF));
 
     if ((LoopSize & 0xFFFF0000) != 0)
-      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                         .addReg(Vtmp).addImm(LoopSize >> 16));
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(IsThumb2 ? ARM::t2MOVTi16 : ARM::MOVTi16),
+                             varEnd)
+                         .addReg(Vtmp)
+                         .addImm(LoopSize >> 16));
   } else {
     MachineConstantPool *ConstantPool = MF->getConstantPool();
     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
diff --git a/llvm/test/CodeGen/ARM/struct_byval.ll b/llvm/test/CodeGen/ARM/struct_byval.ll
index 8a04e4d82b2f3..d7b9b477ec1ef 100644
--- a/llvm/test/CodeGen/ARM/struct_byval.ll
+++ b/llvm/test/CodeGen/ARM/struct_byval.ll
@@ -1,5 +1,9 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s -check-prefix=THUMB
+; RUN: llc < %s -mtriple=armv7-unknown-nacl-gnueabi | FileCheck %s -check-prefix=NACL
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi | FileCheck %s -check-prefix=NOMOVT
+
+; NOMOVT-NOT: movt
 
 ; rdar://9877866
 %struct.SmallStruct = type { i32, [8 x i32], [37 x i8] }
@@ -33,6 +37,14 @@ entry:
 ; THUMB: sub
 ; THUMB: str
 ; THUMB: bne
+; NACL-LABEL: g:
+; Ensure that use movw instead of constpool for the loop trip count. But don't
+; match the __stack_chk_guard movw
+; NACL: movw r{{[1-9]}}, #
+; NACL: ldr
+; NACL: sub
+; NACL: str
+; NACL: bne
   %st = alloca %struct.LargeStruct, align 4
   %call = call i32 @e2(%struct.LargeStruct* byval %st)
   ret i32 0
@@ -51,6 +63,11 @@ entry:
 ; THUMB: sub
 ; THUMB: vst1
 ; THUMB: bne
+; NACL: movw r{{[1-9]}}, #
+; NACL: vld1
+; NACL: sub
+; NACL: vst1
+; NACL: bne
   %st = alloca %struct.LargeStruct, align 16
   %call = call i32 @e3(%struct.LargeStruct* byval align 16 %st)
   ret i32 0