[ARM] Deliberately prevent inline asm in low overhead loops. NFC

This case was already handled by one of the "else" branches in maybeLoweredToCall, so this patch is NFC (no functional change); it makes the check explicit and adds a test. We may want to support inline asm in certain situations in the future, but for the moment we simply do not try to create low overhead loops that contain inline asm.

Differential Revision: https://reviews.llvm.org/D91257
llvm · Nov 19, 2020 · 006b3bd · 006b3bd
1 parent 1407833
commit 006b3bd
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 1 deletion.
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1694,7 +1694,8 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
   auto ScanLoop = [&](Loop *L) {
     for (auto *BB : L->getBlocks()) {
       for (auto &I : *BB) {
-        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I)) {
+        if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
+            isa<InlineAsm>(I)) {
           LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
           return false;
         }

diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/inlineasm.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define i32 @test(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: test:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    blt .LBB0_4
+; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    mov lr, r0
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB0_2: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh r3, [r1], #2
+; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    ldrh r12, [lr], #2
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    add r3, r12
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add r0, r3
+; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %s.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %s.011 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.010
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = tail call i32 asm "add $0, $1, $2", "=r,r,r"(i16 %0, i16 %1) #1
+  %add = add nsw i32 %2, %s.011
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @testlr(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: testlr:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    blt .LBB1_4
+; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
+; CHECK-NEXT:    mov r3, r0
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB1_2: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh r4, [r1], #2
+; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    ldrh r12, [r3], #2
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    add r4, r12
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add r0, r4
+; CHECK-NEXT:    bne .LBB1_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %s.0.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %s.011 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.010
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{lr}"(i16 %0, i16 %1) #1
+  %add = add nsw i32 %2, %s.011
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}