| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| //==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // This file defines the SchedRead/Write data for the ARM Cortex-M4 processor. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| // Machine model record for Cortex-M4. The values below describe a | ||
| // single-issue, in-order core to the scheduler. | ||
| def CortexM4Model : SchedMachineModel { | ||
| let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue | ||
| let MicroOpBufferSize = 0; // In-order | ||
| let LoadLatency = 2; // Latency when not pipelined, not pc-relative | ||
| let MispredictPenalty = 2; // Best case branch taken cost | ||
| let PostRAScheduler = 1; | ||
|
|
||
| // NOTE(review): the model is explicitly marked as not complete; presumably | ||
| // not every instruction has scheduling information here -- confirm. | ||
| let CompleteModel = 0; | ||
| } | ||
|
|
||
|
|
||
| // We model the entire CPU as a single pipeline with BufferSize = 0, since the | ||
| // Cortex-M4 is in-order. | ||
|
|
||
| def M4Unit : ProcResource<1> { let BufferSize = 0; } | ||
|
|
||
|
|
||
| let SchedModel = CortexM4Model in { | ||
|
|
||
| // Some definitions of latencies we apply to different instructions | ||
|
|
||
| // WriteRes helpers: bind a SchedWrite to M4Unit with the latency given in the | ||
| // class name (1, 2, 3 or 14 cycles). | ||
| class M4UnitL1<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 1; } | ||
| class M4UnitL2<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 2; } | ||
| class M4UnitL3<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 3; } | ||
| class M4UnitL14<SchedWrite write> : WriteRes<write, [M4Unit]> { let Latency = 14; } | ||
| // Standalone SchedWriteRes records on M4Unit with latency 1 and 2, used by the | ||
| // InstRW helpers below. | ||
| def M4UnitL1_wr : SchedWriteRes<[M4Unit]> { let Latency = 1; } | ||
| def M4UnitL2_wr : SchedWriteRes<[M4Unit]> { let Latency = 2; } | ||
| // InstRW helpers: apply the latency-1 / latency-2 record to the instructions | ||
| // selected by the `instr` dag. | ||
| class M4UnitL1I<dag instr> : InstRW<[M4UnitL1_wr], instr>; | ||
| class M4UnitL2I<dag instr> : InstRW<[M4UnitL2_wr], instr>; | ||
|
|
||
|
|
||
| // Loads, MACs and DIV all get a higher latency of 2 | ||
| def : M4UnitL2<WriteLd>; | ||
| def : M4UnitL2<WriteMAC32>; | ||
| def : M4UnitL2<WriteMAC64Hi>; | ||
| def : M4UnitL2<WriteMAC64Lo>; | ||
| def : M4UnitL2<WriteMAC16>; | ||
| def : M4UnitL2<WriteDIV>; | ||
|
|
||
| def : M4UnitL2I<(instregex "(t|t2)LDM")>; | ||
|
|
||
|
|
||
| // Stores use a latency of 1, as they have no outputs | ||
|
|
||
| def : M4UnitL1<WriteST>; | ||
| def : M4UnitL1I<(instregex "(t|t2)STM")>; | ||
|
|
||
|
|
||
| // Everything else has a Latency of 1 | ||
|
|
||
| def : M4UnitL1<WriteALU>; | ||
| def : M4UnitL1<WriteALUsi>; | ||
| def : M4UnitL1<WriteALUsr>; | ||
| def : M4UnitL1<WriteALUSsr>; | ||
| def : M4UnitL1<WriteBr>; | ||
| def : M4UnitL1<WriteBrL>; | ||
| def : M4UnitL1<WriteBrTbl>; | ||
| def : M4UnitL1<WriteCMPsi>; | ||
| def : M4UnitL1<WriteCMPsr>; | ||
| def : M4UnitL1<WriteCMP>; | ||
| def : M4UnitL1<WriteMUL32>; | ||
| def : M4UnitL1<WriteMUL64Hi>; | ||
| def : M4UnitL1<WriteMUL64Lo>; | ||
| def : M4UnitL1<WriteMUL16>; | ||
| def : M4UnitL1<WriteNoop>; | ||
| def : M4UnitL1<WritePreLd>; | ||
| // The entries below assign latency 1 by opcode (instrs / instregex) rather | ||
| // than through a SchedWrite class. | ||
| def : M4UnitL1I<(instregex "(t|t2)MOV")>; | ||
| def : M4UnitL1I<(instrs COPY)>; | ||
| def : M4UnitL1I<(instregex "t2IT")>; | ||
| def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", | ||
| "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; | ||
|
|
||
| // Integer operand reads (ALU, shifted-register ALU, MUL, MAC) all use a | ||
| // ReadAdvance of 0 cycles. | ||
| def : ReadAdvance<ReadALU, 0>; | ||
| def : ReadAdvance<ReadALUsr, 0>; | ||
| def : ReadAdvance<ReadMUL, 0>; | ||
| def : ReadAdvance<ReadMAC, 0>; | ||
|
|
||
| // Most FP instructions have single-cycle latency, except MACs, divides and | ||
| // square roots. Loads still take 2 cycles. | ||
|
|
||
| // Single-cycle FP operations: convert, move, add/sub-style ALU and multiply. | ||
| def : M4UnitL1<WriteFPCVT>; | ||
| def : M4UnitL1<WriteFPMOV>; | ||
| def : M4UnitL1<WriteFPALU32>; | ||
| def : M4UnitL1<WriteFPALU64>; | ||
| def : M4UnitL1<WriteFPMUL32>; | ||
| def : M4UnitL1<WriteFPMUL64>; | ||
| // Opcode-matched overrides: instructions matching "VLD" get latency 2, | ||
| // instructions matching "VST" get latency 1. | ||
| def : M4UnitL2I<(instregex "VLD")>; | ||
| def : M4UnitL1I<(instregex "VST")>; | ||
| // Longer-latency FP operations: MAC = 3 cycles, divide and square root = 14. | ||
| def : M4UnitL3<WriteFPMAC32>; | ||
| def : M4UnitL3<WriteFPMAC64>; | ||
| def : M4UnitL14<WriteFPDIV32>; | ||
| def : M4UnitL14<WriteFPDIV64>; | ||
| def : M4UnitL14<WriteFPSQRT32>; | ||
| def : M4UnitL14<WriteFPSQRT64>; | ||
| // NOTE(review): the WriteVLD1-4 classes are given latency 1 here, while the | ||
| // "VLD" instregex above uses latency 2 -- confirm this difference is intended. | ||
| def : M4UnitL1<WriteVLD1>; | ||
| def : M4UnitL1<WriteVLD2>; | ||
| def : M4UnitL1<WriteVLD3>; | ||
| def : M4UnitL1<WriteVLD4>; | ||
| def : M4UnitL1<WriteVST1>; | ||
| def : M4UnitL1<WriteVST2>; | ||
| def : M4UnitL1<WriteVST3>; | ||
| def : M4UnitL1<WriteVST4>; | ||
|
|
||
| // FP multiply and MAC operand reads also use a ReadAdvance of 0 cycles. | ||
| def : ReadAdvance<ReadFPMUL, 0>; | ||
| def : ReadAdvance<ReadFPMAC, 0>; | ||
|
|
||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| # RUN: llc %s -run-pass machine-scheduler -o - | FileCheck %s | ||
|
|
||
| # The input block interleaves each load with the add that consumes it; the | ||
| # checks expect the machine scheduler to emit both loads before both adds. | ||
| # CHECK-LABEL: bb.0. | ||
| # CHECK: t2LDRi12 | ||
| # CHECK-NEXT: t2LDRi12 | ||
| # CHECK-NEXT: t2ADDri | ||
| # CHECK-NEXT: t2ADDri | ||
| --- | | ||
| target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" | ||
| target triple = "thumbv7em-arm-none-eabi" | ||
|
|
||
| ; Function Attrs: norecurse nounwind optsize readonly | ||
| define dso_local i32 @test(i32* nocapture readonly %a, i32* nocapture readonly %b) local_unnamed_addr #0 { | ||
| entry: | ||
| %0 = load i32, i32* %a, align 4 | ||
| %add = add nsw i32 %0, 10 | ||
| %1 = load i32, i32* %b, align 4 | ||
| %add1 = add nsw i32 %1, 20 | ||
| %mul = mul nsw i32 %add1, %add | ||
| ret i32 %mul | ||
| } | ||
|
|
||
| attributes #0 = { "target-cpu"="cortex-m4" } | ||
|
|
||
| ... | ||
| --- | ||
| name: test | ||
| alignment: 1 | ||
| exposesReturnsTwice: false | ||
| legalized: false | ||
| regBankSelected: false | ||
| selected: false | ||
| failedISel: false | ||
| tracksRegLiveness: true | ||
| registers: | ||
| - { id: 0, class: gpr, preferred-register: '' } | ||
| - { id: 1, class: gpr, preferred-register: '' } | ||
| - { id: 2, class: gprnopc, preferred-register: '' } | ||
| - { id: 3, class: rgpr, preferred-register: '' } | ||
| - { id: 4, class: gprnopc, preferred-register: '' } | ||
| - { id: 5, class: rgpr, preferred-register: '' } | ||
| - { id: 6, class: rgpr, preferred-register: '' } | ||
| liveins: | ||
| - { reg: '$r0', virtual-reg: '%0' } | ||
| - { reg: '$r1', virtual-reg: '%1' } | ||
| body: | | ||
| bb.0.entry: | ||
| liveins: $r0, $r1 | ||
| %1:gpr = COPY $r1 | ||
| %0:gpr = COPY $r0 | ||
| %2:gprnopc = t2LDRi12 %0, 0, 14, $noreg :: (load 4 from %ir.a) | ||
| %3:rgpr = nsw t2ADDri %2, 10, 14, $noreg, $noreg | ||
| %4:gprnopc = t2LDRi12 %1, 0, 14, $noreg :: (load 4 from %ir.b) | ||
| %5:rgpr = nsw t2ADDri %4, 20, 14, $noreg, $noreg | ||
| %6:rgpr = nsw t2MUL %5, %3, 14, $noreg | ||
| $r0 = COPY %6 | ||
| tBX_RET 14, $noreg, implicit $r0 | ||
| ... |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
| ; RUN: llc %s -o - | FileCheck %s | ||
|
|
||
| target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" | ||
| target triple = "thumbv7em-arm-none-eabi" | ||
|
|
||
| %struct.a = type { i32, %struct.b*, i8, i8, i8, i8, i8*, %struct.b*, i16, i16, i16, i16, i16, i16, i16, i16, i32, i32, i32, i32, i32, i32, i32 } | ||
| %struct.b = type { i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, [16 x i8], [64 x i8], [128 x i8], i32, [68 x i8] } | ||
|
|
||
| ; Writes the high byte and then the low byte of %value into the byte array | ||
| ; (field 15) of the %struct.b reached through field 7 of %dhcp. For each byte | ||
| ; the pointer and the i16 length (field 8) are reloaded, the length is used as | ||
| ; the array index, and the incremented length is stored back. | ||
| define void @test(%struct.a* nocapture %dhcp, i16 zeroext %value) #0 { | ||
| ; CHECK-LABEL: test: | ||
| ; CHECK: @ %bb.0: @ %entry | ||
| ; CHECK-NEXT: .save {r7, lr} | ||
| ; CHECK-NEXT: push {r7, lr} | ||
| ; CHECK-NEXT: ldrh r3, [r0, #20] | ||
| ; CHECK-NEXT: ldr.w lr, [r0, #16] | ||
| ; CHECK-NEXT: lsr.w r12, r1, #8 | ||
| ; CHECK-NEXT: adds r2, r3, #1 | ||
| ; CHECK-NEXT: strh r2, [r0, #20] | ||
| ; CHECK-NEXT: add.w r2, lr, r3 | ||
| ; CHECK-NEXT: strb.w r12, [r2, #240] | ||
| ; CHECK-NEXT: ldrh r2, [r0, #20] | ||
| ; CHECK-NEXT: ldr.w r12, [r0, #16] | ||
| ; CHECK-NEXT: adds r3, r2, #1 | ||
| ; CHECK-NEXT: strh r3, [r0, #20] | ||
| ; CHECK-NEXT: add.w r0, r12, r2 | ||
| ; CHECK-NEXT: strb.w r1, [r0, #240] | ||
| ; CHECK-NEXT: pop {r7, pc} | ||
| entry: | ||
| %shr = lshr i16 %value, 8 | ||
| %conv1 = trunc i16 %shr to i8 | ||
| %msg_out = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 7 | ||
| %0 = load %struct.b*, %struct.b** %msg_out, align 4 | ||
| %options_out_len = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 8 | ||
| %1 = load i16, i16* %options_out_len, align 4 | ||
| %inc = add i16 %1, 1 | ||
| store i16 %inc, i16* %options_out_len, align 4 | ||
| %idxprom = zext i16 %1 to i32 | ||
| %arrayidx = getelementptr inbounds %struct.b, %struct.b* %0, i32 0, i32 15, i32 %idxprom | ||
| store i8 %conv1, i8* %arrayidx, align 1 | ||
| %conv4 = trunc i16 %value to i8 | ||
| %2 = load %struct.b*, %struct.b** %msg_out, align 4 | ||
| %3 = load i16, i16* %options_out_len, align 4 | ||
| %inc8 = add i16 %3, 1 | ||
| store i16 %inc8, i16* %options_out_len, align 4 | ||
| %idxprom9 = zext i16 %3 to i32 | ||
| %arrayidx10 = getelementptr inbounds %struct.b, %struct.b* %2, i32 0, i32 15, i32 %idxprom9 | ||
| store i8 %conv4, i8* %arrayidx10, align 1 | ||
| ret void | ||
| } | ||
|
|
||
| attributes #0 = { minsize optsize "target-cpu"="cortex-m4" } |