20 changes: 0 additions & 20 deletions llvm/lib/Target/ARM/ARMScheduleM3.td

This file was deleted.

119 changes: 119 additions & 0 deletions llvm/lib/Target/ARM/ARMScheduleM4.td
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
//==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the SchedRead/Write data for the ARM Cortex-M4 processor.
//
//===----------------------------------------------------------------------===//

// Machine model record for Cortex-M4. The per-instruction latencies and
// resources are attached to this model further down in the file.
def CortexM4Model : SchedMachineModel {
  // Pipeline shape.
  // Only IT can be dual-issued, so the core is treated as single-issue.
  let IssueWidth = 1;
  // Zero-sized micro-op buffer: in-order.
  let MicroOpBufferSize = 0;

  // Default costs.
  // Load latency when not pipelined and not pc-relative.
  let LoadLatency = 2;
  // Best-case cost of a taken branch.
  let MispredictPenalty = 2;

  // Scheduler configuration flags.
  let PostRAScheduler = 1;
  let CompleteModel = 0;
}


// Cortex-M4 is in-order, so the whole CPU is described as one pipeline:
// a single processor resource with BufferSize = 0.

def M4Unit : ProcResource<1> {
  let BufferSize = 0;
}


let SchedModel = CortexM4Model in {

  // Helper classes: each binds a SchedWrite to M4Unit with a fixed latency
  // (the number in the class name is the latency in cycles).
  class M4UnitL1<SchedWrite write> : WriteRes<write, [M4Unit]> {
    let Latency = 1;
  }
  class M4UnitL2<SchedWrite write> : WriteRes<write, [M4Unit]> {
    let Latency = 2;
  }
  class M4UnitL3<SchedWrite write> : WriteRes<write, [M4Unit]> {
    let Latency = 3;
  }
  class M4UnitL14<SchedWrite write> : WriteRes<write, [M4Unit]> {
    let Latency = 14;
  }

  // Stand-alone write resources plus the InstRW wrappers that apply them to
  // instructions selected by name or by regular expression.
  def M4UnitL1_wr : SchedWriteRes<[M4Unit]> {
    let Latency = 1;
  }
  def M4UnitL2_wr : SchedWriteRes<[M4Unit]> {
    let Latency = 2;
  }
  class M4UnitL1I<dag instr> : InstRW<[M4UnitL1_wr], instr>;
  class M4UnitL2I<dag instr> : InstRW<[M4UnitL2_wr], instr>;


  // Latency 2: loads, multiply-accumulates and integer divide.
  foreach w = [WriteLd, WriteMAC32, WriteMAC64Hi, WriteMAC64Lo, WriteMAC16,
               WriteDIV] in
    def : M4UnitL2<w>;

  def : M4UnitL2I<(instregex "(t|t2)LDM")>;


  // Stores produce no outputs, so they are given a latency of 1.
  def : M4UnitL1<WriteST>;
  def : M4UnitL1I<(instregex "(t|t2)STM")>;


  // All remaining integer writes have a latency of 1.
  foreach w = [WriteALU, WriteALUsi, WriteALUsr, WriteALUSsr,
               WriteBr, WriteBrL, WriteBrTbl,
               WriteCMPsi, WriteCMPsr, WriteCMP,
               WriteMUL32, WriteMUL64Hi, WriteMUL64Lo, WriteMUL16,
               WriteNoop, WritePreLd] in
    def : M4UnitL1<w>;

  def : M4UnitL1I<(instregex "(t|t2)MOV")>;
  def : M4UnitL1I<(instrs COPY)>;
  def : M4UnitL1I<(instregex "t2IT")>;
  def : M4UnitL1I<(instregex "t2SEL",
                             "t2USAD8",
                             "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)",
                             "t2USADA8",
                             "(t|t2)REV")>;

  // Integer reads: ReadAdvance of 0.
  foreach r = [ReadALU, ReadALUsr, ReadMUL, ReadMAC] in
    def : ReadAdvance<r, 0>;


  // Floating point: most operations are single-cycle; the exceptions are
  // MACs (3), divides and square roots (14). Loads still take 2 cycles.
  foreach w = [WriteFPCVT, WriteFPMOV, WriteFPALU32, WriteFPALU64,
               WriteFPMUL32, WriteFPMUL64] in
    def : M4UnitL1<w>;

  def : M4UnitL2I<(instregex "VLD")>;
  def : M4UnitL1I<(instregex "VST")>;

  foreach w = [WriteFPMAC32, WriteFPMAC64] in
    def : M4UnitL3<w>;

  foreach w = [WriteFPDIV32, WriteFPDIV64, WriteFPSQRT32, WriteFPSQRT64] in
    def : M4UnitL14<w>;

  // The WriteVLDn / WriteVSTn writes are all given a latency of 1.
  foreach w = [WriteVLD1, WriteVLD2, WriteVLD3, WriteVLD4,
               WriteVST1, WriteVST2, WriteVST3, WriteVST4] in
    def : M4UnitL1<w>;

  // Floating-point reads: ReadAdvance of 0.
  foreach r = [ReadFPMUL, ReadFPMAC] in
    def : ReadAdvance<r, 0>;

}
30 changes: 15 additions & 15 deletions llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

; CHECK-LABEL: add_user
; CHECK: %for.body
; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
%cmp24 = icmp sgt i32 %arg, 0
Expand Down Expand Up @@ -53,10 +53,10 @@ for.body:

; CHECK-LABEL: mul_bottom_user
; CHECK: %for.body
; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: mul [[COUNT:r[0-9]+]],{{.*}}[[SXT]]
define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
Expand Down Expand Up @@ -104,8 +104,8 @@ for.body:
; CHECK: %for.body
; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: asr.w [[ASR:[rl0-9]+]], [[B]], #16
; CHECK: asrs [[ASR:[rl0-9]+]], [[A]], #16
; CHECK: smlad [[ACC:[rl0-9]+]], [[A]], [[B]], [[ACC]]
; CHECK: mul [[COUNT:[rl0-9]+]],{{.}}[[ASR]]
define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
Expand Down Expand Up @@ -151,10 +151,10 @@ for.body:

; CHECK-LABEL: and_user
; CHECK: %for.body
; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: mul [[MUL:r[0-9]+]],{{.*}}[[UXT]]
define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
Expand Down Expand Up @@ -201,12 +201,12 @@ for.body:

; CHECK-LABEL: multi_uses
; CHECK: %for.body
; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: ldr [[A:[rl0-9]+]], [{{.*}}, #2]!
; CHECK: ldr [[B:[rl0-9]+]], [{{.*}}, #2]!
; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
; CHECK: mul [[MUL:r[0-9]+]],{{.*}}[[SXT]]
; CHECK: muls [[MUL:r[0-9]+]],{{.*}}[[SXT]]
; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
entry:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3

; CHECK-M4F-LABEL: test_1double_nosplit:
; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
; CHECK-M4F: movt [[ONEHI]], #16368
; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp]
; CHECK-M4F: bl test_1double_nosplit
call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0)
Expand All @@ -97,8 +97,8 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double],

; CHECK-M4F-LABEL: test_1double_misaligned:
; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
; CHECK-M4F: movt [[ONEHI]], #16368
; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp, #8]
; CHECK-M4F: bl test_1double_misaligned

Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/ARM/useaa.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
; RUN: llc < %s -mtriple=armv7m-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
; RUN: llc < %s -mtriple=armv8m-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA
; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC

; Check we use AA during codegen, so can interleave these loads/stores.
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,10 @@ if.end:
; CHECK-BP: str
; CHECK-BP: b
; CHECK-BP: str
; CHECK-BP: ldr
; CHECK-BP: add
; CHECK-NOBP: ittee
; CHECK-NOBP: streq
; CHECK-NOBP: ldreq
; CHECK-NOBP: addeq
; CHECK-NOBP: strne
; CHECK-NOBP: strne
define i32 @diamond2(i32 %n, i32* %p, i32* %q) {
Expand All @@ -119,7 +119,7 @@ if.then:

if.else:
store i32 %n, i32* %q, align 4
%0 = load i32, i32* %p, align 4
%0 = add i32 %n, 10
br label %if.end

if.end:
Expand Down
60 changes: 60 additions & 0 deletions llvm/test/CodeGen/Thumb2/m4-sched-ldr.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# RUN: llc %s -run-pass machine-scheduler -o - | FileCheck %s
#
# Runs only the machine-scheduler pass over the MIR body below, with
# "target-cpu"="cortex-m4" set on the function.
# The input body is ordered load, add, load, add. The FileCheck directives
# below require the two t2LDRi12 loads to come out back to back, followed
# directly by the two t2ADDri instructions.
# NOTE(review): presumably this reordering comes from the load latency in the
# Cortex-M4 scheduling model -- confirm against the model.

# CHECK-LABEL: bb.0.
# CHECK: t2LDRi12
# CHECK-NEXT: t2LDRi12
# CHECK-NEXT: t2ADDri
# CHECK-NEXT: t2ADDri
--- |
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7em-arm-none-eabi"

; Function Attrs: norecurse nounwind optsize readonly
define dso_local i32 @test(i32* nocapture readonly %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
entry:
%0 = load i32, i32* %a, align 4
%add = add nsw i32 %0, 10
%1 = load i32, i32* %b, align 4
%add1 = add nsw i32 %1, 20
%mul = mul nsw i32 %add1, %add
ret i32 %mul
}

attributes #0 = { "target-cpu"="cortex-m4" }

...
---
name: test
alignment: 1
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gprnopc, preferred-register: '' }
- { id: 3, class: rgpr, preferred-register: '' }
- { id: 4, class: gprnopc, preferred-register: '' }
- { id: 5, class: rgpr, preferred-register: '' }
- { id: 6, class: rgpr, preferred-register: '' }
liveins:
- { reg: '$r0', virtual-reg: '%0' }
- { reg: '$r1', virtual-reg: '%1' }
body: |
bb.0.entry:
liveins: $r0, $r1
%1:gpr = COPY $r1
%0:gpr = COPY $r0
%2:gprnopc = t2LDRi12 %0, 0, 14, $noreg :: (load 4 from %ir.a)
%3:rgpr = nsw t2ADDri %2, 10, 14, $noreg, $noreg
%4:gprnopc = t2LDRi12 %1, 0, 14, $noreg :: (load 4 from %ir.b)
%5:rgpr = nsw t2ADDri %4, 20, 14, $noreg, $noreg
%6:rgpr = nsw t2MUL %5, %3, 14, $noreg
$r0 = COPY %6
tBX_RET 14, $noreg, implicit $r0
...
52 changes: 52 additions & 0 deletions llvm/test/CodeGen/Thumb2/m4-sched-regs.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7em-arm-none-eabi"

%struct.a = type { i32, %struct.b*, i8, i8, i8, i8, i8*, %struct.b*, i16, i16, i16, i16, i16, i16, i16, i16, i32, i32, i32, i32, i32, i32, i32 }
%struct.b = type { i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, [16 x i8], [64 x i8], [128 x i8], i32, [68 x i8] }

; @test appends two bytes of %value to a byte buffer reached through %dhcp:
; first the high byte (%value >> 8), then the low byte.
; For each byte it loads the %struct.b pointer (field 7 of %struct.a) and the
; i16 counter (field 8), stores counter + 1 back, and writes the byte into
; field 15 of %struct.b at the old counter value. The pointer and counter are
; loaded again before the second byte.
; The autogenerated assertions pin the exact instruction sequence and register
; assignment that llc emits for this function.
define void @test(%struct.a* nocapture %dhcp, i16 zeroext %value) #0 {
; CHECK-LABEL: test:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: ldrh r3, [r0, #20]
; CHECK-NEXT: ldr.w lr, [r0, #16]
; CHECK-NEXT: lsr.w r12, r1, #8
; CHECK-NEXT: adds r2, r3, #1
; CHECK-NEXT: strh r2, [r0, #20]
; CHECK-NEXT: add.w r2, lr, r3
; CHECK-NEXT: strb.w r12, [r2, #240]
; CHECK-NEXT: ldrh r2, [r0, #20]
; CHECK-NEXT: ldr.w r12, [r0, #16]
; CHECK-NEXT: adds r3, r2, #1
; CHECK-NEXT: strh r3, [r0, #20]
; CHECK-NEXT: add.w r0, r12, r2
; CHECK-NEXT: strb.w r1, [r0, #240]
; CHECK-NEXT: pop {r7, pc}
entry:
%shr = lshr i16 %value, 8
%conv1 = trunc i16 %shr to i8
%msg_out = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 7
%0 = load %struct.b*, %struct.b** %msg_out, align 4
%options_out_len = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 8
%1 = load i16, i16* %options_out_len, align 4
%inc = add i16 %1, 1
store i16 %inc, i16* %options_out_len, align 4
%idxprom = zext i16 %1 to i32
%arrayidx = getelementptr inbounds %struct.b, %struct.b* %0, i32 0, i32 15, i32 %idxprom
store i8 %conv1, i8* %arrayidx, align 1
%conv4 = trunc i16 %value to i8
%2 = load %struct.b*, %struct.b** %msg_out, align 4
%3 = load i16, i16* %options_out_len, align 4
%inc8 = add i16 %3, 1
store i16 %inc8, i16* %options_out_len, align 4
%idxprom9 = zext i16 %3 to i32
%arrayidx10 = getelementptr inbounds %struct.b, %struct.b* %2, i32 0, i32 15, i32 %idxprom9
store i8 %conv4, i8* %arrayidx10, align 1
ret void
}

; Attribute group #0: minsize and optsize, with cortex-m4 as the target CPU.
attributes #0 = { minsize optsize "target-cpu"="cortex-m4" }