414 changes: 414 additions & 0 deletions llvm/lib/Target/AVR/AVRISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i16, Custom);
setOperationAction(ISD::SHL, MVT::i16, Custom);
setOperationAction(ISD::SRL, MVT::i16, Custom);
setOperationAction(ISD::SRA, MVT::i32, Custom);
setOperationAction(ISD::SHL, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i32, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
Expand Down Expand Up @@ -247,10 +250,13 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE(CALL);
NODE(WRAPPER);
NODE(LSL);
NODE(LSLW);
NODE(LSR);
NODE(LSRW);
NODE(ROL);
NODE(ROR);
NODE(ASR);
NODE(ASRW);
NODE(LSLLOOP);
NODE(LSRLOOP);
NODE(ROLLOOP);
Expand Down Expand Up @@ -279,6 +285,57 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
assert(isPowerOf2_32(VT.getSizeInBits()) &&
"Expected power-of-2 shift amount");

if (VT.getSizeInBits() == 32) {
if (!isa<ConstantSDNode>(N->getOperand(1))) {
// 32-bit shifts are converted to a loop in IR.
// This should be unreachable.
report_fatal_error("Expected a constant shift amount!");
}
SDVTList ResTys = DAG.getVTList(MVT::i16, MVT::i16);
SDValue SrcLo =
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
DAG.getConstant(0, dl, MVT::i16));
SDValue SrcHi =
DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i16, Op.getOperand(0),
DAG.getConstant(1, dl, MVT::i16));
uint64_t ShiftAmount =
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ShiftAmount == 16) {
// Special case these two operations because they appear to be used by the
// generic codegen parts to lower 32-bit numbers.
// TODO: perhaps we can lower shift amounts bigger than 16 to a 16-bit
// shift of a part of the 32-bit value?
switch (Op.getOpcode()) {
case ISD::SHL: {
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Zero, SrcLo);
}
case ISD::SRL: {
SDValue Zero = DAG.getConstant(0, dl, MVT::i16);
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, SrcHi, Zero);
}
}
}
SDValue Cnt = DAG.getTargetConstant(ShiftAmount, dl, MVT::i8);
unsigned Opc;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Invalid 32-bit shift opcode!");
case ISD::SHL:
Opc = AVRISD::LSLW;
break;
case ISD::SRL:
Opc = AVRISD::LSRW;
break;
case ISD::SRA:
Opc = AVRISD::ASRW;
break;
}
SDValue Result = DAG.getNode(Opc, dl, ResTys, SrcLo, SrcHi, Cnt);
return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i32, Result.getValue(0),
Result.getValue(1));
}

// Expand non-constant shifts to loops.
if (!isa<ConstantSDNode>(N->getOperand(1))) {
switch (Op.getOpcode()) {
Expand Down Expand Up @@ -1789,6 +1846,359 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
return RemBB;
}

// Do a multibyte AVR shift. Insert shift instructions and put the output
// registers in the Regs array.
// Because AVR does not have a normal shift instruction (only a single bit shift
// instruction), we have to emulate this behavior with other instructions.
// It first tries large steps (moving registers around) and then smaller steps
// like single bit shifts.
// Large shifts actually reduce the number of shifted registers, so the below
// algorithms have to work independently of the number of registers that are
// shifted.
// For more information and background, see this blogpost:
// https://aykevl.nl/2021/02/avr-bitshift
//
// \param MI       Instruction before which the new instructions are inserted.
// \param BB       Block that receives the new instructions.
// \param Regs     In/out array of (register, subregister index) pairs holding
//                 the value to shift, most significant byte first. On return
//                 the entries are overwritten with the registers that hold the
//                 shifted result.
// \param Opc      One of ISD::SHL, ISD::SRL, ISD::SRA.
// \param ShiftAmt Constant shift amount.
static void insertMultibyteShift(MachineInstr &MI, MachineBasicBlock *BB,
                                 MutableArrayRef<std::pair<Register, int>> Regs,
                                 ISD::NodeType Opc, int64_t ShiftAmt) {
  const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
  const AVRSubtarget &STI = BB->getParent()->getSubtarget<AVRSubtarget>();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  const DebugLoc &dl = MI.getDebugLoc();

  const bool ShiftLeft = Opc == ISD::SHL;
  const bool ArithmeticShift = Opc == ISD::SRA;

  // Zero a register, for use in later operations.
  Register ZeroReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
  BuildMI(*BB, MI, dl, TII.get(AVR::COPY), ZeroReg)
      .addReg(STI.getZeroRegister());

  // Do a shift modulo 6 or 7. This is a bit more complicated than most shifts
  // and is hard to compose with the rest, so these are special cased.
  // The basic idea is to shift one or two bits in the opposite direction and
  // then move registers around to get the correct end result.
  if (ShiftLeft && (ShiftAmt % 8) >= 6) {
    // Left shift modulo 6 or 7.

    // Create a slice of the registers we're going to modify, to ease working
    // with them.
    size_t ShiftRegsOffset = ShiftAmt / 8;
    size_t ShiftRegsSize = Regs.size() - ShiftRegsOffset;
    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
        Regs.slice(ShiftRegsOffset, ShiftRegsSize);

    // Shift one to the right, keeping the least significant bit as the carry
    // bit.
    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);

    // Rotate the least significant bit from the carry bit into a new register
    // (that starts out zero).
    Register LowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
    BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), LowByte).addReg(ZeroReg);

    // Shift one more to the right if this is a modulo-6 shift.
    if (ShiftAmt % 8 == 6) {
      insertMultibyteShift(MI, BB, ShiftRegs, ISD::SRL, 1);
      Register NewLowByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), NewLowByte).addReg(LowByte);
      LowByte = NewLowByte;
    }

    // Move all registers to the left, zeroing the bottom registers as needed.
    for (size_t I = 0; I < Regs.size(); I++) {
      int ShiftRegsIdx = I + 1;
      if (ShiftRegsIdx < (int)ShiftRegs.size()) {
        Regs[I] = ShiftRegs[ShiftRegsIdx];
      } else if (ShiftRegsIdx == (int)ShiftRegs.size()) {
        Regs[I] = std::pair(LowByte, 0);
      } else {
        Regs[I] = std::pair(ZeroReg, 0);
      }
    }

    return;
  }

  // Right shift modulo 6 or 7.
  if (!ShiftLeft && (ShiftAmt % 8) >= 6) {
    // Create a view on the registers we're going to modify, to ease working
    // with them.
    size_t ShiftRegsSize = Regs.size() - (ShiftAmt / 8);
    MutableArrayRef<std::pair<Register, int>> ShiftRegs =
        Regs.slice(0, ShiftRegsSize);

    // Shift one to the left.
    insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);

    // Sign or zero extend the most significant register into a new register.
    // The HighByte is the byte that still has one (or two) bits from the
    // original value. The ExtByte is purely a zero/sign extend byte (all bits
    // are either 0 or 1).
    Register HighByte = MRI.createVirtualRegister(&AVR::GPR8RegClass);
    Register ExtByte = 0;
    if (ArithmeticShift) {
      // Sign-extend bit that was shifted out last (sbc Rd,Rd yields 0x00 or
      // 0xff depending on the carry flag).
      BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), HighByte)
          .addReg(HighByte, RegState::Undef)
          .addReg(HighByte, RegState::Undef);
      ExtByte = HighByte;
      // The highest bit of the original value is the same as the zero-extend
      // byte, so HighByte and ExtByte are the same.
    } else {
      // Use the zero register for zero extending.
      ExtByte = ZeroReg;
      // Rotate most significant bit into a new register (that starts out zero).
      BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), HighByte)
          .addReg(ExtByte)
          .addReg(ExtByte);
    }

    // Shift one more to the left for modulo 6 shifts.
    if (ShiftAmt % 8 == 6) {
      insertMultibyteShift(MI, BB, ShiftRegs, ISD::SHL, 1);
      // Shift the topmost bit into the HighByte.
      Register NewExt = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), NewExt)
          .addReg(HighByte)
          .addReg(HighByte);
      HighByte = NewExt;
    }

    // Move all to the right, while sign or zero extending.
    for (int I = Regs.size() - 1; I >= 0; I--) {
      int ShiftRegsIdx = I - (Regs.size() - ShiftRegs.size()) - 1;
      if (ShiftRegsIdx >= 0) {
        Regs[I] = ShiftRegs[ShiftRegsIdx];
      } else if (ShiftRegsIdx == -1) {
        Regs[I] = std::pair(HighByte, 0);
      } else {
        Regs[I] = std::pair(ExtByte, 0);
      }
    }

    return;
  }

  // For shift amounts of at least one register, simply rename the registers and
  // zero the bottom registers.
  while (ShiftLeft && ShiftAmt >= 8) {
    // Move all registers one to the left.
    for (size_t I = 0; I < Regs.size() - 1; I++) {
      Regs[I] = Regs[I + 1];
    }

    // Zero the least significant register.
    Regs[Regs.size() - 1] = std::pair(ZeroReg, 0);

    // Continue shifts with the leftover registers.
    Regs = Regs.drop_back(1);

    ShiftAmt -= 8;
  }

  // And again, the same for right shifts.
  Register ShrExtendReg = 0;
  if (!ShiftLeft && ShiftAmt >= 8) {
    if (ArithmeticShift) {
      // Sign extend the most significant register into ShrExtendReg:
      // add Rd,Rd copies the sign bit into the carry flag, sbc Rd,Rd then
      // broadcasts it into all eight bits.
      ShrExtendReg = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      Register Tmp = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Tmp)
          .addReg(Regs[0].first, 0, Regs[0].second)
          .addReg(Regs[0].first, 0, Regs[0].second);
      BuildMI(*BB, MI, dl, TII.get(AVR::SBCRdRr), ShrExtendReg)
          .addReg(Tmp)
          .addReg(Tmp);
    } else {
      ShrExtendReg = ZeroReg;
    }
    for (; ShiftAmt >= 8; ShiftAmt -= 8) {
      // Move all registers one to the right.
      for (size_t I = Regs.size() - 1; I != 0; I--) {
        Regs[I] = Regs[I - 1];
      }

      // Zero or sign extend the most significant register.
      Regs[0] = std::pair(ShrExtendReg, 0);

      // Continue shifts with the leftover registers.
      Regs = Regs.drop_front(1);
    }
  }

  // The bigger shifts are already handled above.
  assert((ShiftAmt < 8) && "Unexpected shift amount");

  // Shift by four bits, using a complicated swap/eor/andi/eor sequence.
  // It only works for logical shifts because the bits shifted in are all
  // zeroes.
  // To shift a single byte right, it produces code like this:
  //   swap r0
  //   andi r0, 0x0f
  // For a two-byte (16-bit) shift, it adds the following instructions to shift
  // the upper byte into the lower byte:
  //   swap r1
  //   eor r0, r1
  //   andi r1, 0x0f
  //   eor r0, r1
  // For bigger shifts, it repeats the above sequence. For example, for a 3-byte
  // (24-bit) shift it adds:
  //   swap r2
  //   eor r1, r2
  //   andi r2, 0x0f
  //   eor r1, r2
  if (!ArithmeticShift && ShiftAmt >= 4) {
    Register Prev = 0;
    for (size_t I = 0; I < Regs.size(); I++) {
      size_t Idx = ShiftLeft ? I : Regs.size() - I - 1;
      Register SwapReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
      BuildMI(*BB, MI, dl, TII.get(AVR::SWAPRd), SwapReg)
          .addReg(Regs[Idx].first, 0, Regs[Idx].second);
      if (I != 0) {
        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
            .addReg(Prev)
            .addReg(SwapReg);
        Prev = R;
      }
      Register AndReg = MRI.createVirtualRegister(&AVR::LD8RegClass);
      BuildMI(*BB, MI, dl, TII.get(AVR::ANDIRdK), AndReg)
          .addReg(SwapReg)
          .addImm(ShiftLeft ? 0xf0 : 0x0f);
      if (I != 0) {
        Register R = MRI.createVirtualRegister(&AVR::GPR8RegClass);
        BuildMI(*BB, MI, dl, TII.get(AVR::EORRdRr), R)
            .addReg(Prev)
            .addReg(AndReg);
        size_t PrevIdx = ShiftLeft ? Idx - 1 : Idx + 1;
        Regs[PrevIdx] = std::pair(R, 0);
      }
      Prev = AndReg;
      Regs[Idx] = std::pair(AndReg, 0);
    }
    ShiftAmt -= 4;
  }

  // Shift by one. This is the fallback that always works, and the shift
  // operation that is used for 1, 2, and 3 bit shifts.
  // Note: use 'int' (not POSIX ssize_t, which MSVC lacks) for the backwards
  // loop index.
  while (ShiftLeft && ShiftAmt) {
    // Shift one to the left.
    for (int I = Regs.size() - 1; I >= 0; I--) {
      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      Register In = Regs[I].first;
      Register InSubreg = Regs[I].second;
      if (I == (int)Regs.size() - 1) { // first iteration
        BuildMI(*BB, MI, dl, TII.get(AVR::ADDRdRr), Out)
            .addReg(In, 0, InSubreg)
            .addReg(In, 0, InSubreg);
      } else {
        BuildMI(*BB, MI, dl, TII.get(AVR::ADCRdRr), Out)
            .addReg(In, 0, InSubreg)
            .addReg(In, 0, InSubreg);
      }
      Regs[I] = std::pair(Out, 0);
    }
    ShiftAmt--;
  }
  while (!ShiftLeft && ShiftAmt) {
    // Shift one to the right.
    for (size_t I = 0; I < Regs.size(); I++) {
      Register Out = MRI.createVirtualRegister(&AVR::GPR8RegClass);
      Register In = Regs[I].first;
      Register InSubreg = Regs[I].second;
      if (I == 0) {
        unsigned Opc = ArithmeticShift ? AVR::ASRRd : AVR::LSRRd;
        BuildMI(*BB, MI, dl, TII.get(Opc), Out).addReg(In, 0, InSubreg);
      } else {
        BuildMI(*BB, MI, dl, TII.get(AVR::RORRd), Out).addReg(In, 0, InSubreg);
      }
      Regs[I] = std::pair(Out, 0);
    }
    ShiftAmt--;
  }

  if (ShiftAmt != 0) {
    llvm_unreachable("don't know how to shift!"); // sanity check
  }
}

// Do a wide (32-bit) shift by a constant amount.
// The 32-bit value lives in two 16-bit register pairs (operands 2 and 3 of the
// pseudo); the result is written to two 16-bit register pairs (operands 0 and
// 1). The actual byte-level shuffling is done by insertMultibyteShift.
MachineBasicBlock *
AVRTargetLowering::insertWideShift(MachineInstr &MI,
                                   MachineBasicBlock *BB) const {
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  const DebugLoc &dl = MI.getDebugLoc();

  // The constant shift amount (operand 4 of the pseudo instruction).
  int64_t ShiftAmt = MI.getOperand(4).getImm();
  ISD::NodeType Opc;
  switch (MI.getOpcode()) {
  default:
    // Without a default case, Opc would be read uninitialized below if an
    // unexpected opcode ever reached this point (and compilers warn about it).
    llvm_unreachable("unexpected opcode!");
  case AVR::Lsl32:
    Opc = ISD::SHL;
    break;
  case AVR::Lsr32:
    Opc = ISD::SRL;
    break;
  case AVR::Asr32:
    Opc = ISD::SRA;
    break;
  }

  // Read the input registers, with the most significant register at index 0.
  std::array<std::pair<Register, int>, 4> Registers = {
      std::pair(MI.getOperand(3).getReg(), AVR::sub_hi),
      std::pair(MI.getOperand(3).getReg(), AVR::sub_lo),
      std::pair(MI.getOperand(2).getReg(), AVR::sub_hi),
      std::pair(MI.getOperand(2).getReg(), AVR::sub_lo),
  };

  // Do the shift. The registers are modified in-place.
  insertMultibyteShift(MI, BB, Registers, Opc, ShiftAmt);

  // Combine the 8-bit registers into 16-bit register pairs.
  // This done either from LSB to MSB or from MSB to LSB, depending on the
  // shift. It's an optimization so that the register allocator will use the
  // fewest movs possible (which order we use isn't a correctness issue, just an
  // optimization issue).
  //   - lsl prefers starting from the most significant byte (2nd case).
  //   - lshr prefers starting from the least significant byte (1st case).
  //   - for ashr it depends on the number of shifted bytes.
  // Some shift operations still don't get the most optimal mov sequences even
  // with this distinction. TODO: figure out why and try to fix it (but we're
  // already equal to or faster than avr-gcc in all cases except ashr 8).
  if (Opc != ISD::SHL &&
      (Opc != ISD::SRA || (ShiftAmt < 16 || ShiftAmt >= 22))) {
    // Use the resulting registers starting with the least significant byte.
    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
        .addReg(Registers[3].first, 0, Registers[3].second)
        .addImm(AVR::sub_lo)
        .addReg(Registers[2].first, 0, Registers[2].second)
        .addImm(AVR::sub_hi);
    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
        .addReg(Registers[1].first, 0, Registers[1].second)
        .addImm(AVR::sub_lo)
        .addReg(Registers[0].first, 0, Registers[0].second)
        .addImm(AVR::sub_hi);
  } else {
    // Use the resulting registers starting with the most significant byte.
    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(1).getReg())
        .addReg(Registers[0].first, 0, Registers[0].second)
        .addImm(AVR::sub_hi)
        .addReg(Registers[1].first, 0, Registers[1].second)
        .addImm(AVR::sub_lo);
    BuildMI(*BB, MI, dl, TII.get(AVR::REG_SEQUENCE), MI.getOperand(0).getReg())
        .addReg(Registers[2].first, 0, Registers[2].second)
        .addImm(AVR::sub_hi)
        .addReg(Registers[3].first, 0, Registers[3].second)
        .addImm(AVR::sub_lo);
  }

  // Remove the pseudo instruction.
  MI.eraseFromParent();
  return BB;
}

static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
if (I->getOpcode() == AVR::COPY) {
Register SrcReg = I->getOperand(1).getReg();
Expand Down Expand Up @@ -1901,6 +2311,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AVR::Asr8:
case AVR::Asr16:
return insertShift(MI, MBB);
case AVR::Lsl32:
case AVR::Lsr32:
case AVR::Asr32:
return insertWideShift(MI, MBB);
case AVR::MULRdRr:
case AVR::MULSRdRr:
return insertMul(MI, MBB);
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AVR/AVRISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,17 @@ enum NodeType {
LSLBN, ///< Byte logical shift left N bits.
LSLWN, ///< Word logical shift left N bits.
LSLHI, ///< Higher 8-bit of word logical shift left.
LSLW, ///< Wide logical shift left.
LSR, ///< Logical shift right.
LSRBN, ///< Byte logical shift right N bits.
LSRWN, ///< Word logical shift right N bits.
LSRLO, ///< Lower 8-bit of word logical shift right.
LSRW, ///< Wide logical shift right.
ASR, ///< Arithmetic shift right.
ASRBN, ///< Byte arithmetic shift right N bits.
ASRWN, ///< Word arithmetic shift right N bits.
ASRLO, ///< Lower 8-bit of word arithmetic shift right.
ASRW, ///< Wide arithmetic shift right.
ROR, ///< Bit rotate right.
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
Expand Down Expand Up @@ -186,6 +189,8 @@ class AVRTargetLowering : public TargetLowering {

private:
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *insertWideShift(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *insertCopyZero(MachineInstr &MI,
MachineBasicBlock *BB) const;
Expand Down
18 changes: 18 additions & 0 deletions llvm/lib/Target/AVR/AVRInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
def AVRlslwn : SDNode<"AVRISD::LSLWN", SDTIntBinOp>;
def AVRlsrwn : SDNode<"AVRISD::LSRWN", SDTIntBinOp>;
def AVRasrwn : SDNode<"AVRISD::ASRWN", SDTIntBinOp>;
def AVRlslw : SDNode<"AVRISD::LSLW", SDTIntShiftDOp>;
def AVRlsrw : SDNode<"AVRISD::LSRW", SDTIntShiftDOp>;
def AVRasrw : SDNode<"AVRISD::ASRW", SDTIntShiftDOp>;

// Pseudo shift nodes for non-constant shift amounts.
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
Expand Down Expand Up @@ -2337,6 +2340,11 @@ def Lsl16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;

// 32-bit logical shift left by a constant amount. The i32 value is split over
// two 16-bit register pairs; this pseudo is expanded to real instructions by
// the custom inserter (insertWideShift) in AVRISelLowering.cpp.
def Lsl32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
                        "# Lsl32 PSEUDO",
                        [(set i16:$dstlo, i16:$dsthi, (AVRlslw i16:$srclo, i16:$srchi, i8:$cnt))]>;

def Lsr8 : ShiftPseudo<(outs GPR8
: $dst),
(ins GPR8
Expand All @@ -2357,6 +2365,11 @@ def Lsr16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;

// 32-bit logical shift right by a constant amount. The i32 value is split over
// two 16-bit register pairs; this pseudo is expanded to real instructions by
// the custom inserter (insertWideShift) in AVRISelLowering.cpp.
def Lsr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
                        "# Lsr32 PSEUDO",
                        [(set i16:$dstlo, i16:$dsthi, (AVRlsrw i16:$srclo, i16:$srchi, i8:$cnt))]>;

def Rol8 : ShiftPseudo<(outs GPR8
: $dst),
(ins GPR8
Expand Down Expand Up @@ -2417,6 +2430,11 @@ def Asr16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;

// 32-bit arithmetic shift right by a constant amount. The i32 value is split
// over two 16-bit register pairs; this pseudo is expanded to real instructions
// by the custom inserter (insertWideShift) in AVRISelLowering.cpp.
def Asr32 : ShiftPseudo<(outs DREGS:$dstlo, DREGS:$dsthi),
                        (ins DREGS:$srclo, DREGS:$srchi, i8imm:$cnt),
                        "# Asr32 PSEUDO",
                        [(set i16:$dstlo, i16:$dsthi, (AVRasrw i16:$srclo, i16:$srchi, i8:$cnt))]>;

// lowered to a copy from the zero register.
let usesCustomInserter=1 in
def CopyZero : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
Expand Down
577 changes: 577 additions & 0 deletions llvm/test/CodeGen/AVR/shift32.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,577 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=avr -mattr=movw -verify-machineinstrs | FileCheck %s

; a 1-bit shift is a single lsl on the low byte plus a rol per higher byte
define i32 @shl_i32_1(i32 %a) {
; CHECK-LABEL: shl_i32_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: ret
  %res = shl i32 %a, 1
  ret i32 %res
}

define i32 @shl_i32_2(i32 %a) {
; CHECK-LABEL: shl_i32_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: ret
%res = shl i32 %a, 2
ret i32 %res
}

define i32 @shl_i32_4(i32 %a) {
; CHECK-LABEL: shl_i32_4:
; CHECK: ; %bb.0:
; CHECK-NEXT: swap r25
; CHECK-NEXT: andi r25, 240
; CHECK-NEXT: swap r24
; CHECK-NEXT: eor r25, r24
; CHECK-NEXT: andi r24, 240
; CHECK-NEXT: eor r25, r24
; CHECK-NEXT: swap r23
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: andi r23, 240
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: swap r22
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: andi r22, 240
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: ret
%res = shl i32 %a, 4
ret i32 %res
}

; shift four bits and then shift one bit
define i32 @shl_i32_5(i32 %a) {
; CHECK-LABEL: shl_i32_5:
; CHECK: ; %bb.0:
; CHECK-NEXT: swap r25
; CHECK-NEXT: andi r25, 240
; CHECK-NEXT: swap r24
; CHECK-NEXT: eor r25, r24
; CHECK-NEXT: andi r24, 240
; CHECK-NEXT: eor r25, r24
; CHECK-NEXT: swap r23
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: andi r23, 240
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: swap r22
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: andi r22, 240
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: ret
%res = shl i32 %a, 5
ret i32 %res
}

; shift two to the right and move the registers around
define i32 @shl_i32_6(i32 %a) {
; CHECK-LABEL: shl_i32_6:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: mov r18, r1
; CHECK-NEXT: ror r18
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ror r18
; CHECK-NEXT: mov r25, r24
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r19, r22
; CHECK-NEXT: movw r22, r18
; CHECK-NEXT: ret
%res = shl i32 %a, 6
ret i32 %res
}


; shift one to the right and move registers around
define i32 @shl_i32_7(i32 %a) {
; CHECK-LABEL: shl_i32_7:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: mov r18, r1
; CHECK-NEXT: ror r18
; CHECK-NEXT: mov r25, r24
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r19, r22
; CHECK-NEXT: movw r22, r18
; CHECK-NEXT: ret
%res = shl i32 %a, 7
ret i32 %res
}

; a whole-byte shift needs no shift instructions: bytes are renamed with mov
; and the vacated low byte is filled from the zero register r1
define i32 @shl_i32_8(i32 %a) {
; CHECK-LABEL: shl_i32_8:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov r25, r24
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r23, r22
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: ret
  %res = shl i32 %a, 8
  ret i32 %res
}

define i32 @shl_i32_9(i32 %a) {
; CHECK-LABEL: shl_i32_9:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: mov r25, r24
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r23, r22
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: ret
%res = shl i32 %a, 9
ret i32 %res
}

; shift 3 of 4 registers and move the others around
define i32 @shl_i32_12(i32 %a) {
; CHECK-LABEL: shl_i32_12:
; CHECK: ; %bb.0:
; CHECK-NEXT: swap r24
; CHECK-NEXT: andi r24, 240
; CHECK-NEXT: swap r23
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: andi r23, 240
; CHECK-NEXT: eor r24, r23
; CHECK-NEXT: swap r22
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: andi r22, 240
; CHECK-NEXT: eor r23, r22
; CHECK-NEXT: mov r25, r24
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r23, r22
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: ret
%res = shl i32 %a, 12
ret i32 %res
}

define i32 @shl_i32_15(i32 %a) {
; CHECK-LABEL: shl_i32_15:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r18, r22
; CHECK-NEXT: lsr r24
; CHECK-NEXT: ror r19
; CHECK-NEXT: ror r18
; CHECK-NEXT: mov r23, r1
; CHECK-NEXT: ror r23
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = shl i32 %a, 15
ret i32 %res
}

; This is a special case: this shift is performed directly inside SelectionDAG
; instead of as a custom lowering like the other shift operations.
define i32 @shl_i32_16(i32 %a) {
; CHECK-LABEL: shl_i32_16:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r24, r22
; CHECK-NEXT: ldi r22, 0
; CHECK-NEXT: ldi r23, 0
; CHECK-NEXT: ret
%res = shl i32 %a, 16
ret i32 %res
}

; Combined with the register allocator, shift instructions can sometimes be
; optimized away entirely. The least significant registers are simply stored
; directly instead of moving them first.
define void @shl_i32_16_ptr(i32 %a, ptr %ptr) {
; CHECK-LABEL: shl_i32_16_ptr:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r30, r20
; CHECK-NEXT: std Z+2, r22
; CHECK-NEXT: std Z+3, r23
; CHECK-NEXT: ldi r24, 0
; CHECK-NEXT: ldi r25, 0
; CHECK-NEXT: st Z, r24
; CHECK-NEXT: std Z+1, r25
; CHECK-NEXT: ret
%res = shl i32 %a, 16
store i32 %res, ptr %ptr
ret void
}

; shift only the most significant byte and then move it
define i32 @shl_i32_28(i32 %a) {
; CHECK-LABEL: shl_i32_28:
; CHECK: ; %bb.0:
; CHECK-NEXT: swap r22
; CHECK-NEXT: andi r22, 240
; CHECK-NEXT: mov r25, r22
; CHECK-NEXT: mov r24, r1
; CHECK-NEXT: mov r23, r1
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: ret
%res = shl i32 %a, 28
ret i32 %res
}

; move the rightmost bit to the leftmost bit and clear the rest
define i32 @shl_i32_31(i32 %a) {
; CHECK-LABEL: shl_i32_31:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r22
; CHECK-NEXT: mov r25, r1
; CHECK-NEXT: ror r25
; CHECK-NEXT: mov r24, r1
; CHECK-NEXT: mov r23, r1
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: ret
%res = shl i32 %a, 31
ret i32 %res
}

define i32 @lshr_i32_1(i32 %a) {
; CHECK-LABEL: lshr_i32_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ret
%res = lshr i32 %a, 1
ret i32 %res
}

define i32 @lshr_i32_2(i32 %a) {
; CHECK-LABEL: lshr_i32_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ret
%res = lshr i32 %a, 2
ret i32 %res
}

define i32 @lshr_i32_4(i32 %a) {
; CHECK-LABEL: lshr_i32_4:
; CHECK: ; %bb.0:
; CHECK-NEXT: swap r22
; CHECK-NEXT: andi r22, 15
; CHECK-NEXT: swap r23
; CHECK-NEXT: eor r22, r23
; CHECK-NEXT: andi r23, 15
; CHECK-NEXT: eor r22, r23
; CHECK-NEXT: swap r24
; CHECK-NEXT: eor r23, r24
; CHECK-NEXT: andi r24, 15
; CHECK-NEXT: eor r23, r24
; CHECK-NEXT: swap r25
; CHECK-NEXT: eor r24, r25
; CHECK-NEXT: andi r25, 15
; CHECK-NEXT: eor r24, r25
; CHECK-NEXT: ret
%res = lshr i32 %a, 4
ret i32 %res
}

define i32 @lshr_i32_6(i32 %a) {
; CHECK-LABEL: lshr_i32_6:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: mov r19, r1
; CHECK-NEXT: rol r19
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: rol r19
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r18, r25
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = lshr i32 %a, 6
ret i32 %res
}

define i32 @lshr_i32_7(i32 %a) {
; CHECK-LABEL: lshr_i32_7:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: mov r19, r1
; CHECK-NEXT: rol r19
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r18, r25
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = lshr i32 %a, 7
ret i32 %res
}

; a whole-byte right shift is pure register renaming, zero-filling the top
; byte from the zero register r1
define i32 @lshr_i32_8(i32 %a) {
; CHECK-LABEL: lshr_i32_8:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r24, r25
; CHECK-NEXT: mov r25, r1
; CHECK-NEXT: ret
  %res = lshr i32 %a, 8
  ret i32 %res
}

define i32 @lshr_i32_9(i32 %a) {
; CHECK-LABEL: lshr_i32_9:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r24, r25
; CHECK-NEXT: mov r25, r1
; CHECK-NEXT: ret
%res = lshr i32 %a, 9
ret i32 %res
}

; a 16-bit right shift moves the high register pair down with a single movw
; and zeroes the upper pair
define i32 @lshr_i32_16(i32 %a) {
; CHECK-LABEL: lshr_i32_16:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r22, r24
; CHECK-NEXT: ldi r24, 0
; CHECK-NEXT: ldi r25, 0
; CHECK-NEXT: ret
  %res = lshr i32 %a, 16
  ret i32 %res
}

define i32 @lshr_i32_24(i32 %a) {
; CHECK-LABEL: lshr_i32_24:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov r22, r25
; CHECK-NEXT: mov r23, r1
; CHECK-NEXT: mov r24, r1
; CHECK-NEXT: mov r25, r1
; CHECK-NEXT: ret
%res = lshr i32 %a, 24
ret i32 %res
}

define i32 @lshr_i32_31(i32 %a) {
; CHECK-LABEL: lshr_i32_31:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r25
; CHECK-NEXT: mov r22, r1
; CHECK-NEXT: rol r22
; CHECK-NEXT: mov r23, r1
; CHECK-NEXT: mov r24, r1
; CHECK-NEXT: mov r25, r1
; CHECK-NEXT: ret
%res = lshr i32 %a, 31
ret i32 %res
}

define i32 @ashr_i32_1(i32 %a) {
; CHECK-LABEL: ashr_i32_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ret
%res = ashr i32 %a, 1
ret i32 %res
}

define i32 @ashr_i32_2(i32 %a) {
; CHECK-LABEL: ashr_i32_2:
; CHECK: ; %bb.0:
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ret
%res = ashr i32 %a, 2
ret i32 %res
}

; can't use the swap/andi/eor trick here
define i32 @ashr_i32_4(i32 %a) {
; CHECK-LABEL: ashr_i32_4:
; CHECK: ; %bb.0:
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: asr r25
; CHECK-NEXT: ror r24
; CHECK-NEXT: ror r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: ret
%res = ashr i32 %a, 4
ret i32 %res
}

define i32 @ashr_i32_7(i32 %a) {
; CHECK-LABEL: ashr_i32_7:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r22
; CHECK-NEXT: rol r23
; CHECK-NEXT: rol r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: sbc r19, r19
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r18, r25
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = ashr i32 %a, 7
ret i32 %res
}

; TODO: this could be optimized to 4 movs, instead of 5.
define i32 @ashr_i32_8(i32 %a) {
; CHECK-LABEL: ashr_i32_8:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov r19, r25
; CHECK-NEXT: lsl r19
; CHECK-NEXT: sbc r19, r19
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: mov r23, r24
; CHECK-NEXT: mov r18, r25
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = ashr i32 %a, 8
ret i32 %res
}

define i32 @ashr_i32_16(i32 %a) {
; CHECK-LABEL: ashr_i32_16:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r22, r24
; CHECK-NEXT: lsl r25
; CHECK-NEXT: sbc r25, r25
; CHECK-NEXT: mov r24, r25
; CHECK-NEXT: ret
%res = ashr i32 %a, 16
ret i32 %res
}

define i32 @ashr_i32_17(i32 %a) {
; CHECK-LABEL: ashr_i32_17:
; CHECK: ; %bb.0:
; CHECK-NEXT: movw r22, r24
; CHECK-NEXT: lsl r25
; CHECK-NEXT: sbc r25, r25
; CHECK-NEXT: asr r23
; CHECK-NEXT: ror r22
; CHECK-NEXT: mov r24, r25
; CHECK-NEXT: ret
%res = ashr i32 %a, 17
ret i32 %res
}

define i32 @ashr_i32_22(i32 %a) {
; CHECK-LABEL: ashr_i32_22:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: sbc r18, r18
; CHECK-NEXT: lsl r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: mov r23, r18
; CHECK-NEXT: rol r23
; CHECK-NEXT: mov r22, r25
; CHECK-NEXT: mov r19, r18
; CHECK-NEXT: movw r24, r18
; CHECK-NEXT: ret
%res = ashr i32 %a, 22
ret i32 %res
}

define i32 @ashr_i32_23(i32 %a) {
; CHECK-LABEL: ashr_i32_23:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r24
; CHECK-NEXT: rol r25
; CHECK-NEXT: sbc r23, r23
; CHECK-NEXT: mov r22, r25
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r25, r23
; CHECK-NEXT: ret
%res = ashr i32 %a, 23
ret i32 %res
}

define i32 @ashr_i32_30(i32 %a) {
; CHECK-LABEL: ashr_i32_30:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r25
; CHECK-NEXT: sbc r23, r23
; CHECK-NEXT: lsl r25
; CHECK-NEXT: mov r22, r23
; CHECK-NEXT: rol r22
; CHECK-NEXT: mov r24, r23
; CHECK-NEXT: mov r25, r23
; CHECK-NEXT: ret
%res = ashr i32 %a, 30
ret i32 %res
}

; shift the sign bit into the carry flag (lsl) and broadcast it to all four
; bytes with sbc/mov/movw
define i32 @ashr_i32_31(i32 %a) {
; CHECK-LABEL: ashr_i32_31:
; CHECK: ; %bb.0:
; CHECK-NEXT: lsl r25
; CHECK-NEXT: sbc r22, r22
; CHECK-NEXT: mov r23, r22
; CHECK-NEXT: movw r24, r22
; CHECK-NEXT: ret
  %res = ashr i32 %a, 31
  ret i32 %res
}