446 changes: 446 additions & 0 deletions llvm/lib/Target/X86/ImmutableGraph.h

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion llvm/lib/Target/X86/X86.h
@@ -120,7 +120,7 @@ FunctionPass *createX86DomainReassignmentPass();
FunctionPass *createX86EvexToVexInsts();

/// This pass creates the thunks for the retpoline feature.
FunctionPass *createX86RetpolineThunksPass();
FunctionPass *createX86IndirectThunksPass();

/// This pass ensures instructions featuring a memory operand
/// have distinctive <LineNumber, Discriminator> (with respect to each other)
@@ -133,6 +133,9 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);

FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
FunctionPass *createX86LoadValueInjectionLoadHardeningUnoptimizedPass();
FunctionPass *createX86LoadValueInjectionRetHardeningPass();
FunctionPass *createX86SpeculativeLoadHardeningPass();

void initializeEvexToVexInstPassPass(PassRegistry &);
@@ -148,6 +151,9 @@ void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
void initializeX86LoadValueInjectionLoadHardeningUnoptimizedPassPass(PassRegistry &);
void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);

16 changes: 16 additions & 0 deletions llvm/lib/Target/X86/X86.td
@@ -426,6 +426,22 @@ def FeatureRetpolineExternalThunk
"ourselves. Only has effect when combined with some other retpoline "
"feature", [FeatureRetpolineIndirectCalls]>;

// Mitigate LVI attacks against indirect calls/branches and call returns
def FeatureLVIControlFlowIntegrity
: SubtargetFeature<
"lvi-cfi", "UseLVIControlFlowIntegrity", "true",
"Prevent indirect calls/branches from using a memory operand, and "
"precede all indirect calls/branches from a register with an "
"LFENCE instruction to serialize control flow. Also decompose RET "
"instructions into a POP+LFENCE+JMP sequence.">;

// Mitigate LVI attacks against data loads
def FeatureLVILoadHardening
: SubtargetFeature<
"lvi-load-hardening", "UseLVILoadHardening", "true",
"Insert LFENCE instructions to prevent data speculatively injected "
"into loads from being used maliciously.">;

// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction">;
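The two subtarget features above surface in C++ as X86Subtarget bits with matching getters (added later in this patch, in X86Subtarget.h). A minimal sketch, assuming only those getters, of how a machine pass can tell whether either LVI mitigation was requested for a function; the helper name is made up for illustration:

```
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"

// Hypothetical helper, not part of the patch: true when the function was
// compiled with "+lvi-cfi" and/or "+lvi-load-hardening" in its target features.
static bool wantsLVIMitigation(const llvm::MachineFunction &MF) {
  const auto &STI = MF.getSubtarget<llvm::X86Subtarget>();
  return STI.useLVIControlFlowIntegrity() || STI.useLVILoadHardening();
}
```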
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86FastISel.cpp
@@ -3202,8 +3202,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
return false;

// Functions using retpoline for indirect calls need to use SDISel.
if (Subtarget->useRetpolineIndirectCalls())
// Functions using thunks for indirect calls need to use SDISel.
if (Subtarget->useIndirectThunkCalls())
return false;

// Handle only C, fastcc, and webkit_js calling conventions for now.
10 changes: 5 additions & 5 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -765,10 +765,10 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;

// FIXME: Add retpoline support and remove this.
if (Is64Bit && IsLargeCodeModel && STI.useRetpolineIndirectCalls())
// FIXME: Add indirect thunk support and remove this.
if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
report_fatal_error("Emitting stack probe calls on 64-bit with the large "
"code model and retpoline not yet implemented.");
"code model and indirect thunks not yet implemented.");

unsigned CallOp;
if (Is64Bit)
@@ -2493,9 +2493,9 @@ void X86FrameLowering::adjustForSegmentedStacks(
// is laid out within 2^31 bytes of each function body, but this seems
// to be sufficient for JIT.
// FIXME: Add retpoline support and remove the error here.
if (STI.useRetpolineIndirectCalls())
if (STI.useIndirectThunkCalls())
report_fatal_error("Emitting morestack calls on 64-bit with the large "
"code model and retpoline not yet implemented.");
"code model and thunks not yet implemented.");
BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
.addReg(X86::RIP)
.addImm(0)
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -987,7 +987,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
!Subtarget->useRetpolineIndirectCalls() &&
!Subtarget->useIndirectThunkCalls() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
81 changes: 45 additions & 36 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30221,8 +30221,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
}

bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
// If the subtarget is using retpolines, we need to not generate jump tables.
if (Subtarget.useRetpolineIndirectBranches())
// If the subtarget is using thunks, we need to not generate jump tables.
if (Subtarget.useIndirectThunkBranches())
return false;

// Otherwise, fallback on the generic logic.
@@ -31345,22 +31345,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
return BB;
}

static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
case X86::RETPOLINE_CALL32:
case X86::INDIRECT_THUNK_CALL32:
return X86::CALLpcrel32;
case X86::RETPOLINE_CALL64:
case X86::INDIRECT_THUNK_CALL64:
return X86::CALL64pcrel32;
case X86::RETPOLINE_TCRETURN32:
case X86::INDIRECT_THUNK_TCRETURN32:
return X86::TCRETURNdi;
case X86::RETPOLINE_TCRETURN64:
case X86::INDIRECT_THUNK_TCRETURN64:
return X86::TCRETURNdi64;
}
llvm_unreachable("not retpoline opcode");
llvm_unreachable("not indirect thunk opcode");
}

static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
@@ -31392,39 +31392,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
llvm_unreachable("unexpected reg for external indirect thunk");
}

if (Subtarget.useRetpolineIndirectCalls() ||
Subtarget.useRetpolineIndirectBranches()) {
// When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edi";
case X86::R11:
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_retpoline_r11";
}
llvm_unreachable("unexpected reg for retpoline");
}

// When targeting an internal COMDAT thunk use an LLVM-specific name.
switch (Reg) {
case X86::EAX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_eax";
case X86::ECX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_ecx";
case X86::EDX:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edx";
case X86::EDI:
assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
return "__llvm_retpoline_edi";
case X86::R11:
if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__llvm_retpoline_r11";
return "__llvm_lvi_thunk_r11";
}
llvm_unreachable("unexpected reg for retpoline");
llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
MachineBasicBlock *BB) const {
X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());

// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
@@ -31458,7 +31467,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");

const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);

BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
@@ -32234,11 +32243,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
case X86::RETPOLINE_CALL32:
case X86::RETPOLINE_CALL64:
case X86::RETPOLINE_TCRETURN32:
case X86::RETPOLINE_TCRETURN64:
return EmitLoweredRetpoline(MI, BB);
case X86::INDIRECT_THUNK_CALL32:
case X86::INDIRECT_THUNK_CALL64:
case X86::INDIRECT_THUNK_TCRETURN32:
case X86::INDIRECT_THUNK_TCRETURN64:
return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
case X86::CATCHPAD:
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -1482,8 +1482,8 @@ namespace llvm {
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;

MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
MachineBasicBlock *BB) const;

MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
364 changes: 364 additions & 0 deletions llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -0,0 +1,364 @@
//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86 --=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Pass that injects an MI thunk that is used to lower indirect calls in a way
/// that prevents speculation on some x86 processors and can be used to mitigate
/// security vulnerabilities due to targeted speculative execution and side
/// channels such as CVE-2017-5715.
///
/// Currently supported thunks include:
/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls
/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization
/// before making an indirect call/jump
///
/// Note that the reason that this is implemented as a MachineFunctionPass and
/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline
/// serialize all transformations, which can consume lots of memory.
///
/// TODO(chandlerc): All of this code could use better comments and
/// documentation.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "x86-retpoline-thunks"

static const char RetpolineNamePrefix[] = "__llvm_retpoline_";
static const char R11RetpolineName[] = "__llvm_retpoline_r11";
static const char EAXRetpolineName[] = "__llvm_retpoline_eax";
static const char ECXRetpolineName[] = "__llvm_retpoline_ecx";
static const char EDXRetpolineName[] = "__llvm_retpoline_edx";
static const char EDIRetpolineName[] = "__llvm_retpoline_edi";

static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";

namespace {
template <typename Derived> class ThunkInserter {
Derived &getDerived() { return *static_cast<Derived *>(this); }

protected:
bool InsertedThunks;
void doInitialization(Module &M) {}
void createThunkFunction(MachineModuleInfo &MMI, StringRef Name);

public:
void init(Module &M) {
InsertedThunks = false;
getDerived().doInitialization(M);
}
// return `true` if `MMI` or `MF` was modified
bool run(MachineModuleInfo &MMI, MachineFunction &MF);
};

struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
const char *getThunkPrefix() { return RetpolineNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
const auto &STI = MF.getSubtarget<X86Subtarget>();
return (STI.useRetpolineIndirectCalls() ||
STI.useRetpolineIndirectBranches()) &&
!STI.useRetpolineExternalThunk();
}
void insertThunks(MachineModuleInfo &MMI);
void populateThunk(MachineFunction &MF);
};

struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
const char *getThunkPrefix() { return LVIThunkNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
}
void insertThunks(MachineModuleInfo &MMI) {
createThunkFunction(MMI, R11LVIThunkName);
}
void populateThunk(MachineFunction &MF) {
// Grab the entry MBB and erase any other blocks. O0 codegen appears to
// generate two bbs for the entry block.
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
while (MF.size() > 1)
MF.erase(std::next(MF.begin()));

// This code mitigates LVI by replacing each indirect call/jump with a
// direct call/jump to a thunk that looks like:
// ```
// lfence
// jmpq *%r11
// ```
// This ensures that if the value in register %r11 was loaded from memory,
// then the value in %r11 is (architecturally) correct prior to the jump.
const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
MF.front().addLiveIn(X86::R11);
return;
}
};
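Each thunk flavor plugs into the CRTP base above by supplying just four hooks: a name prefix, a subtarget predicate, thunk creation, and thunk population. As a purely hypothetical sketch (not part of the patch) of how a third flavor would slot in, with an invented feature test and an invented single-instruction thunk body:

```
struct ExampleThunkInserter : ThunkInserter<ExampleThunkInserter> {
  const char *getThunkPrefix() { return "__llvm_example_thunk_"; }
  bool mayUseThunk(const MachineFunction &MF) {
    // Stand-in predicate; a real inserter would key off its own feature bit.
    return MF.getSubtarget<X86Subtarget>().is64Bit();
  }
  void insertThunks(MachineModuleInfo &MMI) {
    createThunkFunction(MMI, "__llvm_example_thunk_r11");
  }
  void populateThunk(MachineFunction &MF) {
    // Reduce the stub function to a single entry block, then emit the thunk
    // body (here just an indirect jump through %r11, for illustration).
    MachineBasicBlock *Entry = &MF.front();
    Entry->clear();
    while (MF.size() > 1)
      MF.erase(std::next(MF.begin()));
    const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
    BuildMI(Entry, DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
    Entry->addLiveIn(X86::R11);
  }
};
```

A new inserter like this would also have to be added to the std::tuple of inserters (TIs) held by the X86IndirectThunks pass below.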

class X86IndirectThunks : public MachineFunctionPass {
public:
static char ID;

X86IndirectThunks() : MachineFunctionPass(ID) {}

StringRef getPassName() const override { return "X86 Indirect Thunks"; }

bool doInitialization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;

void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addPreserved<MachineModuleInfoWrapperPass>();
}

private:
std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;

// FIXME: When LLVM moves to C++17, these can become folds
template <typename... ThunkInserterT>
static void initTIs(Module &M,
std::tuple<ThunkInserterT...> &ThunkInserters) {
(void)std::initializer_list<int>{
(std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
}
template <typename... ThunkInserterT>
static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
std::tuple<ThunkInserterT...> &ThunkInserters) {
bool Modified = false;
(void)std::initializer_list<int>{
Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
return Modified;
}
};

} // end anonymous namespace
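Regarding the FIXME in initTIs/runTIs above: with C++17, the std::initializer_list expansion trick can be replaced by fold expressions. A sketch of the equivalent (hypothetical, not part of the patch):

```
template <typename... ThunkInserterT>
static void initTIs(Module &M, std::tuple<ThunkInserterT...> &ThunkInserters) {
  // Comma fold: call init(M) on every inserter in the tuple, in order.
  (std::get<ThunkInserterT>(ThunkInserters).init(M), ...);
}

template <typename... ThunkInserterT>
static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
                   std::tuple<ThunkInserterT...> &ThunkInserters) {
  // Bitwise-or fold: run every inserter (no short-circuiting, matching the
  // original |= accumulation) and report whether any of them modified MF.
  return (std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF) | ...);
}
```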

void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) {
if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64)
createThunkFunction(MMI, R11RetpolineName);
else
for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName,
EDIRetpolineName})
createThunkFunction(MMI, Name);
}

void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64;
Register ThunkReg;
if (Is64Bit) {
assert(MF.getName() == "__llvm_retpoline_r11" &&
"Should only have an r11 thunk on 64-bit targets");

// __llvm_retpoline_r11:
// callq .Lr11_call_target
// .Lr11_capture_spec:
// pause
// lfence
// jmp .Lr11_capture_spec
// .align 16
// .Lr11_call_target:
// movq %r11, (%rsp)
// retq
ThunkReg = X86::R11;
} else {
// For 32-bit targets we need to emit a collection of thunks for various
// possible scratch registers as well as a fallback that uses EDI, which is
// normally callee saved.
// __llvm_retpoline_eax:
// calll .Leax_call_target
// .Leax_capture_spec:
// pause
// jmp .Leax_capture_spec
// .align 16
// .Leax_call_target:
// movl %eax, (%esp) # Clobber return addr
// retl
//
// __llvm_retpoline_ecx:
// ... # Same setup
// movl %ecx, (%esp)
// retl
//
// __llvm_retpoline_edx:
// ... # Same setup
// movl %edx, (%esp)
// retl
//
// __llvm_retpoline_edi:
// ... # Same setup
// movl %edi, (%esp)
// retl
if (MF.getName() == EAXRetpolineName)
ThunkReg = X86::EAX;
else if (MF.getName() == ECXRetpolineName)
ThunkReg = X86::ECX;
else if (MF.getName() == EDXRetpolineName)
ThunkReg = X86::EDX;
else if (MF.getName() == EDIRetpolineName)
ThunkReg = X86::EDI;
else
llvm_unreachable("Invalid thunk name on x86-32!");
}

const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
// Grab the entry MBB and erase any other blocks. O0 codegen appears to
// generate two bbs for the entry block.
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
while (MF.size() > 1)
MF.erase(std::next(MF.begin()));

MachineBasicBlock *CaptureSpec =
MF.CreateMachineBasicBlock(Entry->getBasicBlock());
MachineBasicBlock *CallTarget =
MF.CreateMachineBasicBlock(Entry->getBasicBlock());
MCSymbol *TargetSym = MF.getContext().createTempSymbol();
MF.push_back(CaptureSpec);
MF.push_back(CallTarget);

const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;

Entry->addLiveIn(ThunkReg);
BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);

// The MIR verifier thinks that the CALL in the entry block will fall through
// to CaptureSpec, so mark it as the successor. Technically, CallTarget is
// the successor, but the MIR verifier doesn't know how to cope with that.
Entry->addSuccessor(CaptureSpec);

// In the capture loop for speculation, we want to stop the processor from
// speculating as fast as possible. On Intel processors, the PAUSE instruction
// will block speculation without consuming any execution resources. On AMD
// processors, the PAUSE instruction is (essentially) a nop, so we also use an
// LFENCE instruction which they have advised will stop speculation as well
// with minimal resource utilization. We still end the capture with a jump to
// form an infinite loop to fully guarantee that no matter what implementation
// of the x86 ISA, speculating this code path never escapes.
BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
CaptureSpec->setHasAddressTaken();
CaptureSpec->addSuccessor(CaptureSpec);

CallTarget->addLiveIn(ThunkReg);
CallTarget->setHasAddressTaken();
CallTarget->setAlignment(Align(16));

// Insert return address clobber
const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
const Register SPReg = Is64Bit ? X86::RSP : X86::ESP;
addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false,
0)
.addReg(ThunkReg);

CallTarget->back().setPreInstrSymbol(MF, TargetSym);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}

template <typename Derived>
void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI,
StringRef Name) {
assert(Name.startswith(getDerived().getThunkPrefix()) &&
"Created a thunk with an unexpected prefix!");

Module &M = const_cast<Module &>(*MMI.getModule());
LLVMContext &Ctx = M.getContext();
auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
Function *F =
Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
F->setVisibility(GlobalValue::HiddenVisibility);
F->setComdat(M.getOrInsertComdat(Name));

// Add Attributes so that we don't create a frame, unwind information, or
// inline.
AttrBuilder B;
B.addAttribute(llvm::Attribute::NoUnwind);
B.addAttribute(llvm::Attribute::Naked);
F->addAttributes(llvm::AttributeList::FunctionIndex, B);

// Populate our function a bit so that we can verify.
BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
IRBuilder<> Builder(Entry);

Builder.CreateRetVoid();

// MachineFunctions/MachineBasicBlocks aren't created automatically for the
// IR-level constructs we already made. Create them and insert them into the
// module.
MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);

// Insert EntryMBB into MF. It's not in the module until we do this.
MF.insert(MF.end(), EntryMBB);
// Set MF properties. We never use vregs...
MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
}

template <typename Derived>
bool ThunkInserter<Derived>::run(MachineModuleInfo &MMI, MachineFunction &MF) {
// If MF is not a thunk, check to see if we need to insert a thunk.
if (!MF.getName().startswith(getDerived().getThunkPrefix())) {
// If we've already inserted a thunk, nothing else to do.
if (InsertedThunks)
return false;

// Only add a thunk if one of the functions has the corresponding feature
// enabled in its subtarget, and doesn't enable external thunks.
// FIXME: Conditionalize on indirect calls so we don't emit a thunk when
// nothing will end up calling it.
// FIXME: It's a little silly to look at every function just to enumerate
// the subtargets, but eventually we'll want to look at them for indirect
// calls, so maybe this is OK.
if (!getDerived().mayUseThunk(MF))
return false;

getDerived().insertThunks(MMI);
InsertedThunks = true;
return true;
}

// If this *is* a thunk function, we need to populate it with the correct MI.
getDerived().populateThunk(MF);
return true;
}

FunctionPass *llvm::createX86IndirectThunksPass() {
return new X86IndirectThunks();
}

char X86IndirectThunks::ID = 0;

bool X86IndirectThunks::doInitialization(Module &M) {
initTIs(M, TIs);
return false;
}

bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << getPassName() << '\n');
auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
return runTIs(MMI, MF, TIs);
}
16 changes: 8 additions & 8 deletions llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1213,14 +1213,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),

def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[Not64BitMode, NotUseRetpolineIndirectCalls]>;
Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;

// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
(TCRETURNmi addr:$dst, imm:$off)>,
Requires<[Not64BitMode, IsNotPIC, NotUseRetpolineIndirectCalls]>;
Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;

def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
(TCRETURNdi tglobaladdr:$dst, imm:$off)>,
@@ -1232,21 +1232,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),

def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;

// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
Requires<[In64BitMode, NotUseIndirectThunkCalls]>;

def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In64BitMode, UseRetpolineIndirectCalls]>;
(INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In64BitMode, UseIndirectThunkCalls]>;

def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[Not64BitMode, UseRetpolineIndirectCalls]>;
(INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[Not64BitMode, UseIndirectThunkCalls]>;

def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
(TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
22 changes: 11 additions & 11 deletions llvm/lib/Target/X86/X86InstrControl.td
@@ -237,13 +237,13 @@ let isCall = 1 in
Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
Requires<[Not64BitMode,NotUseRetpolineIndirectCalls]>,
Requires<[Not64BitMode,NotUseIndirectThunkCalls]>,
Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
OpSize32,
Requires<[Not64BitMode,FavorMemIndirectCall,
NotUseRetpolineIndirectCalls]>,
NotUseIndirectThunkCalls]>,
Sched<[WriteJumpLd]>;

// Non-tracking calls for IBT, use with caution.
@@ -334,11 +334,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
Requires<[In64BitMode,NotUseRetpolineIndirectCalls]>;
Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
NotUseRetpolineIndirectCalls]>;
NotUseIndirectThunkCalls]>;

// Non-tracking calls for IBT, use with caution.
let isCodeGenOnly = 1 in {
@@ -393,19 +393,19 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
Uses = [RSP, SSP],
usesCustomInserter = 1,
SchedRW = [WriteJump] in {
def RETPOLINE_CALL32 :
def INDIRECT_THUNK_CALL32 :
PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
Requires<[Not64BitMode,UseRetpolineIndirectCalls]>;
Requires<[Not64BitMode,UseIndirectThunkCalls]>;

def RETPOLINE_CALL64 :
def INDIRECT_THUNK_CALL64 :
PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
Requires<[In64BitMode,UseRetpolineIndirectCalls]>;
Requires<[In64BitMode,UseIndirectThunkCalls]>;

// Retpoline variant of indirect tail calls.
// Indirect thunk variant of indirect tail calls.
let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
def RETPOLINE_TCRETURN64 :
def INDIRECT_THUNK_TCRETURN64 :
PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
def RETPOLINE_TCRETURN32 :
def INDIRECT_THUNK_TCRETURN32 :
PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
}
}
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InstrInfo.td
@@ -996,8 +996,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
def UseRetpolineIndirectCalls : Predicate<"Subtarget->useRetpolineIndirectCalls()">;
def NotUseRetpolineIndirectCalls : Predicate<"!Subtarget->useRetpolineIndirectCalls()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;

//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
900 changes: 900 additions & 0 deletions llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -0,0 +1,143 @@
//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// Description: Replaces every `ret` instruction with the sequence:
/// ```
/// pop <scratch-reg>
/// lfence
/// jmp *<scratch-reg>
/// ```
/// where `<scratch-reg>` is some available scratch register, according to the
/// calling convention of the function being mitigated.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include <bitset>

using namespace llvm;

#define PASS_KEY "x86-lvi-ret"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
"were deployed");

namespace {

class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass {
public:
X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "X86 Load Value Injection (LVI) Ret-Hardening";
}
bool runOnMachineFunction(MachineFunction &MF) override;

static char ID;
};

} // end anonymous namespace

char X86LoadValueInjectionRetHardeningPass::ID = 0;

bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
<< " *****\n");
const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>();
if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
return false; // FIXME: support 32-bit

// Don't skip functions with the "optnone" attr but participate in opt-bisect.
const Function &F = MF.getFunction();
if (!F.hasOptNone() && skipFunction(F))
return false;

++NumFunctionsConsidered;
const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
const X86InstrInfo *TII = Subtarget->getInstrInfo();
unsigned ClobberReg = X86::NoRegister;
std::bitset<X86::NUM_TARGET_REGS> UnclobberableGR64s;
UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer
UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer
UnclobberableGR64s.set(X86::RAX); // used for function return
UnclobberableGR64s.set(X86::RDX); // used for function return

// We can clobber any register allowed by the function's calling convention.
for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR)
UnclobberableGR64s.set(Reg);
for (auto &Reg : X86::GR64RegClass) {
if (!UnclobberableGR64s.test(Reg)) {
ClobberReg = Reg;
break;
}
}

if (ClobberReg != X86::NoRegister) {
LLVM_DEBUG(dbgs() << "Selected register "
<< Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg)
<< " to clobber\n");
} else {
LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n");
}

bool Modified = false;
for (auto &MBB : MF) {
if (MBB.empty())
continue;

MachineInstr &MI = MBB.back();
if (MI.getOpcode() != X86::RETQ)
continue;

if (ClobberReg != X86::NoRegister) {
MBB.erase_instr(&MI);
BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r))
.addReg(ClobberReg, RegState::Define)
.setMIFlag(MachineInstr::FrameDestroy);
BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE));
BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r))
.addReg(ClobberReg);
} else {
// In case there is no available scratch register, we can still read from
// RSP to assert that RSP points to a valid page. The write to RSP is
// also helpful because it verifies that the stack's write permissions
// are intact.
MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
X86::RSP, false, 0)
.addImm(0)
->addRegisterDead(X86::EFLAGS, TRI);
}

++NumFences;
Modified = true;
}

if (Modified)
++NumFunctionsMitigated;
return Modified;
}

INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY,
"X86 LVI ret hardener", false, false)

FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() {
return new X86LoadValueInjectionRetHardeningPass();
}
8 changes: 4 additions & 4 deletions llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1220,8 +1220,8 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
break;
case MachineOperand::MO_Register:
// FIXME: Add retpoline support and remove this.
if (Subtarget->useRetpolineIndirectCalls())
report_fatal_error("Lowering register statepoints with retpoline not "
if (Subtarget->useIndirectThunkCalls())
report_fatal_error("Lowering register statepoints with thunks not "
"yet implemented.");
CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
CallOpcode = X86::CALL64r;
@@ -1399,9 +1399,9 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
// FIXME: Add retpoline support and remove this.
if (Subtarget->useRetpolineIndirectCalls())
if (Subtarget->useIndirectThunkCalls())
report_fatal_error(
"Lowering patchpoint with retpoline not yet implemented.");
"Lowering patchpoint with thunks not yet implemented.");
EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
}

286 changes: 0 additions & 286 deletions llvm/lib/Target/X86/X86RetpolineThunks.cpp

This file was deleted.

27 changes: 25 additions & 2 deletions llvm/lib/Target/X86/X86Subtarget.h
@@ -421,6 +421,16 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// than emitting one inside the compiler.
bool UseRetpolineExternalThunk = false;

/// Prevent generation of indirect call/branch instructions from memory,
/// and force all indirect call/branch instructions from a register to be
/// preceded by an LFENCE. Also decompose RET instructions into a
/// POP+LFENCE+JMP sequence.
bool UseLVIControlFlowIntegrity = false;

/// Insert LFENCE instructions to prevent data speculatively injected into
/// loads from being used maliciously.
bool UseLVILoadHardening = false;

/// Use software floating point for code generation.
bool UseSoftFloat = false;

@@ -707,8 +717,21 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return UseRetpolineIndirectBranches;
}
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }

// These are generic getters that OR together all of the thunk types
// supported by the subtarget. Therefore useIndirectThunk*() will return true
// if any respective thunk feature is enabled.
bool useIndirectThunkCalls() const {
return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
}
bool useIndirectThunkBranches() const {
return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
}

bool preferMaskRegisters() const { return PreferMaskRegisters; }
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
bool useLVILoadHardening() const { return UseLVILoadHardening; }

unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
@@ -853,10 +876,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;

/// If we are using retpolines, we need to expand indirectbr to avoid it
/// If we are using indirect thunks, we need to expand indirectbr to avoid it
/// lowering to an actual indirect jump.
bool enableIndirectBrExpand() const override {
return useRetpolineIndirectBranches();
return useIndirectThunkBranches();
}

/// Enable the MachineScheduler pass for all X86 subtargets.
9 changes: 8 additions & 1 deletion llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -82,6 +82,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
}

@@ -496,6 +498,10 @@ void X86PassConfig::addMachineSSAOptimization() {

void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
if (getOptLevel() != CodeGenOpt::None)
addPass(createX86LoadValueInjectionLoadHardeningPass());
else
addPass(createX86LoadValueInjectionLoadHardeningUnoptimizedPass());
}

void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
@@ -525,7 +531,7 @@ void X86PassConfig::addPreEmitPass2() {
const Triple &TT = TM->getTargetTriple();
const MCAsmInfo *MAI = TM->getMCAsmInfo();

addPass(createX86RetpolineThunksPass());
addPass(createX86IndirectThunksPass());

// Insert extra int3 instructions after trailing call instructions to avoid
// issues in the unwinder.
@@ -542,6 +548,7 @@
// Identify valid longjmp targets for Windows Control Flow Guard.
if (TT.isOSWindows())
addPass(createCFGuardLongjmpPass());
addPass(createX86LoadValueInjectionRetHardeningPass());
}

std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
5 changes: 4 additions & 1 deletion llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -55,6 +55,7 @@
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: X86 Load Value Injection (LVI) Load Hardening (Unoptimized)
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
@@ -71,8 +72,9 @@
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
; CHECK-NEXT: X86 Retpoline Thunks
; CHECK-NEXT: X86 Indirect Thunks
; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
; CHECK-NEXT: X86 Load Value Injection (LVI) Ret-Hardening
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: X86 Assembly Printer
@@ -81,3 +83,4 @@
define void @f() {
ret void
}

8 changes: 6 additions & 2 deletions llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -138,9 +138,11 @@
; CHECK-NEXT: Machine Loop Invariant Code Motion
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: Machine Dominance Frontier Construction
; CHECK-NEXT: X86 Load Value Injection (LVI) Load Hardening
; CHECK-NEXT: PostRA Machine Sink
; CHECK-NEXT: Machine Block Frequency Analysis
; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: MachinePostDominator Tree Construction
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
@@ -180,8 +182,9 @@
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
; CHECK-NEXT: X86 Retpoline Thunks
; CHECK-NEXT: X86 Indirect Thunks
; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
; CHECK-NEXT: X86 Load Value Injection (LVI) Ret-Hardening
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: X86 Assembly Printer
@@ -190,3 +193,4 @@
define void @f() {
ret void
}

129 changes: 129 additions & 0 deletions llvm/test/CodeGen/X86/lvi-hardening-gadget-graph.ll
@@ -0,0 +1,129 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -x86-lvi-load-dot-verify -o %t < %s | FileCheck %s

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test(i32* %untrusted_user_ptr, i32* %secret, i32 %secret_size) #0 {
entry:
%untrusted_user_ptr.addr = alloca i32*, align 8
%secret.addr = alloca i32*, align 8
%secret_size.addr = alloca i32, align 4
%ret_val = alloca i32, align 4
%i = alloca i32, align 4
store i32* %untrusted_user_ptr, i32** %untrusted_user_ptr.addr, align 8
store i32* %secret, i32** %secret.addr, align 8
store i32 %secret_size, i32* %secret_size.addr, align 4
store i32 0, i32* %ret_val, align 4
call void @llvm.x86.sse2.lfence()
store i32 0, i32* %i, align 4
br label %for.cond

for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%1 = load i32, i32* %secret_size.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %for.body, label %for.end

for.body: ; preds = %for.cond
%2 = load i32, i32* %i, align 4
%rem = srem i32 %2, 2
%cmp1 = icmp eq i32 %rem, 0
br i1 %cmp1, label %if.then, label %if.else

if.then: ; preds = %for.body
%3 = load i32*, i32** %secret.addr, align 8
%4 = load i32, i32* %ret_val, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i32, i32* %3, i64 %idxprom
%5 = load i32, i32* %arrayidx, align 4
%6 = load i32*, i32** %untrusted_user_ptr.addr, align 8
store i32 %5, i32* %6, align 4
br label %if.end

if.else: ; preds = %for.body
%7 = load i32*, i32** %secret.addr, align 8
%8 = load i32, i32* %ret_val, align 4
%idxprom2 = sext i32 %8 to i64
%arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %idxprom2
store i32 42, i32* %arrayidx3, align 4
br label %if.end

if.end: ; preds = %if.else, %if.then
%9 = load i32*, i32** %untrusted_user_ptr.addr, align 8
%10 = load i32, i32* %9, align 4
store i32 %10, i32* %ret_val, align 4
br label %for.inc

for.inc: ; preds = %if.end
%11 = load i32, i32* %i, align 4
%inc = add nsw i32 %11, 1
store i32 %inc, i32* %i, align 4
br label %for.cond

for.end: ; preds = %for.cond
%12 = load i32, i32* %ret_val, align 4
ret i32 %12
}

; CHECK: digraph "Speculative gadgets for \"test\" function" {
; CHECK-NEXT: label="Speculative gadgets for \"test\" function";
; CHECK: Node0x{{[0-9a-f]+}} [shape=record,color = green,label="{LFENCE\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.4.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.i)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JCC_1 %bb.6, 13, implicit killed $eflags\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{CMP32rm killed renamable $eax, %stack.2.secret_size.addr, 1, $noreg, 0, $noreg, implicit-def $eflags :: (dereferenceable load 4 from %ir.secret_size.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.4.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.i)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JCC_1 %bb.4, 5, implicit killed $eflags\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.1.secret.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.secret.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm killed renamable $rax, 4, killed renamable $rcx, 0, $noreg :: (load 4 from %ir.arrayidx)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOVSX64rm32 %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOV64rm %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.untrusted_user_ptr.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV32mr killed renamable $rcx, 1, $noreg, 0, $noreg, killed renamable $eax :: (store 4 into %ir.6)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.1.secret.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.secret.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV32mi killed renamable $rax, 4, killed renamable $rcx, 0, $noreg, 42 :: (store 4 into %ir.arrayidx3)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rcx = MOVSX64rm32 %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $rax = MOV64rm %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg :: (dereferenceable load 8 from %ir.untrusted_user_ptr.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[color = red, style = "dashed"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm killed renamable $rax, 1, $noreg, 0, $noreg :: (load 4 from %ir.9)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,color = blue,label="{ARGS}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{MOV64mr %stack.0.untrusted_user_ptr.addr, 1, $noreg, 0, $noreg, killed renamable $rdi :: (store 8 into %ir.untrusted_user_ptr.addr)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JMP_1 %bb.5\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{JMP_1 %bb.1\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 1];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{renamable $eax = MOV32rm %stack.3.ret_val, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.ret_val)\n}"];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} -> Node0x{{[0-9a-f]+}}[label = 0];
; CHECK-NEXT: Node0x{{[0-9a-f]+}} [shape=record,label="{RET 0, $eax\n}"];
; CHECK-NEXT: }

; Function Attrs: nounwind
declare void @llvm.x86.sse2.lfence() #1

attributes #0 = { "target-features"="+lvi-cfi"
"target-features"="+lvi-load-hardening" }
attributes #1 = { nounwind }
281 changes: 281 additions & 0 deletions llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll
@@ -0,0 +1,281 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi < %s | FileCheck %s --check-prefix=X64
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi -O0 < %s | FileCheck %s --check-prefix=X64FAST
;
; Note that a lot of this code was lifted from retpoline.ll.

declare void @bar(i32)

; Test a simple indirect call and tail call.
define void @icall_reg(void (i32)* %fp, i32 %x) {
entry:
tail call void @bar(i32 %x)
tail call void %fp(i32 %x)
tail call void @bar(i32 %x)
tail call void %fp(i32 %x)
ret void
}

; X64-LABEL: icall_reg:
; X64-DAG: movq %rdi, %[[fp:[^ ]*]]
; X64-DAG: movl %esi, %[[x:[^ ]*]]
; X64: movl %esi, %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64: movl %[[x]], %edi
; X64: callq bar
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL

; X64FAST-LABEL: icall_reg:
; X64FAST: callq bar
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: callq bar
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL


@global_fp = external global void (i32)*

; Test an indirect call through a global variable.
define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 {
%fp1 = load void (i32)*, void (i32)** @global_fp
call void %fp1(i32 %x)
%fp2 = load void (i32)*, void (i32)** @global_fp
tail call void %fp2(i32 %x)
ret void
}

; X64-LABEL: icall_global_fp:
; X64-DAG: movl %edi, %[[x:[^ ]*]]
; X64-DAG: movq global_fp(%rip), %r11
; X64: callq __llvm_lvi_thunk_r11
; X64-DAG: movl %[[x]], %edi
; X64-DAG: movq global_fp(%rip), %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL

; X64FAST-LABEL: icall_global_fp:
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: movq global_fp(%rip), %r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL


%struct.Foo = type { void (%struct.Foo*)** }

; Test an indirect call through a vtable.
define void @vcall(%struct.Foo* %obj) #0 {
%vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0
%vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field
%vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1
%fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot
tail call void %fp(%struct.Foo* %obj)
tail call void %fp(%struct.Foo* %obj)
ret void
}

; X64-LABEL: vcall:
; X64: movq %rdi, %[[obj:[^ ]*]]
; X64: movq (%rdi), %[[vptr:[^ ]*]]
; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]]
; X64: movq %[[fp]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64-DAG: movq %[[obj]], %rdi
; X64-DAG: movq %[[fp]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL

; X64FAST-LABEL: vcall:
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL


declare void @direct_callee()

define void @direct_tail() #0 {
tail call void @direct_callee()
ret void
}

; X64-LABEL: direct_tail:
; X64: jmp direct_callee # TAILCALL
; X64FAST-LABEL: direct_tail:
; X64FAST: jmp direct_callee # TAILCALL


declare void @nonlazybind_callee() #1

define void @nonlazybind_caller() #0 {
call void @nonlazybind_callee()
tail call void @nonlazybind_callee()
ret void
}

; X64-LABEL: nonlazybind_caller:
; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]]
; X64: movq %[[REG]], %r11
; X64: callq __llvm_lvi_thunk_r11
; X64: movq %[[REG]], %r11
; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL
; X64FAST-LABEL: nonlazybind_caller:
; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11
; X64FAST: callq __llvm_lvi_thunk_r11
; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11
; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL


; Check that a switch gets lowered using a jump table
define void @switch_jumptable(i32* %ptr, i64* %sink) #0 {
; X64-LABEL: switch_jumptable:
; X64_NOT: jmpq *
entry:
br label %header

header:
%i = load volatile i32, i32* %ptr
switch i32 %i, label %bb0 [
i32 1, label %bb1
i32 2, label %bb2
i32 3, label %bb3
i32 4, label %bb4
i32 5, label %bb5
i32 6, label %bb6
i32 7, label %bb7
i32 8, label %bb8
i32 9, label %bb9
]

bb0:
store volatile i64 0, i64* %sink
br label %header

bb1:
store volatile i64 1, i64* %sink
br label %header

bb2:
store volatile i64 2, i64* %sink
br label %header

bb3:
store volatile i64 3, i64* %sink
br label %header

bb4:
store volatile i64 4, i64* %sink
br label %header

bb5:
store volatile i64 5, i64* %sink
br label %header

bb6:
store volatile i64 6, i64* %sink
br label %header

bb7:
store volatile i64 7, i64* %sink
br label %header

bb8:
store volatile i64 8, i64* %sink
br label %header

bb9:
store volatile i64 9, i64* %sink
br label %header
}


@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0),
i8* blockaddress(@indirectbr_rewrite, %bb1),
i8* blockaddress(@indirectbr_rewrite, %bb2),
i8* blockaddress(@indirectbr_rewrite, %bb3),
i8* blockaddress(@indirectbr_rewrite, %bb4),
i8* blockaddress(@indirectbr_rewrite, %bb5),
i8* blockaddress(@indirectbr_rewrite, %bb6),
i8* blockaddress(@indirectbr_rewrite, %bb7),
i8* blockaddress(@indirectbr_rewrite, %bb8),
i8* blockaddress(@indirectbr_rewrite, %bb9)]

; Check that when thunks are enabled the indirectbr instruction gets
; rewritten to use switch, and that in turn doesn't get lowered as a jump
; table.
define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 {
; X64-LABEL: indirectbr_rewrite:
; X64-NOT: jmpq *
entry:
%i0 = load i64, i64* %p
%target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0
%target0 = load i8*, i8** %target.i0
indirectbr i8* %target0, [label %bb1, label %bb3]

bb0:
store volatile i64 0, i64* %sink
br label %latch

bb1:
store volatile i64 1, i64* %sink
br label %latch

bb2:
store volatile i64 2, i64* %sink
br label %latch

bb3:
store volatile i64 3, i64* %sink
br label %latch

bb4:
store volatile i64 4, i64* %sink
br label %latch

bb5:
store volatile i64 5, i64* %sink
br label %latch

bb6:
store volatile i64 6, i64* %sink
br label %latch

bb7:
store volatile i64 7, i64* %sink
br label %latch

bb8:
store volatile i64 8, i64* %sink
br label %latch

bb9:
store volatile i64 9, i64* %sink
br label %latch

latch:
%i.next = load i64, i64* %p
%target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next
%target.next = load i8*, i8** %target.i.next
; List a full 10 successors here so that, even after the indirectbr is
; rewritten as a switch, lowering would still want to use a jump table.
indirectbr i8* %target.next, [label %bb0,
label %bb1,
label %bb2,
label %bb3,
label %bb4,
label %bb5,
label %bb6,
label %bb7,
label %bb8,
label %bb9]
}

; Lastly check that the necessary thunks were emitted.
;
; X64-LABEL: .section .text.__llvm_lvi_thunk_r11,{{.*}},__llvm_lvi_thunk_r11,comdat
; X64-NEXT: .hidden __llvm_lvi_thunk_r11
; X64-NEXT: .weak __llvm_lvi_thunk_r11
; X64: __llvm_lvi_thunk_r11:
; X64-NEXT: # {{.*}} # %entry
; X64-NEXT: lfence
; X64-NEXT: jmpq *%r11

attributes #1 = { nonlazybind }
144 changes: 144 additions & 0 deletions llvm/test/CodeGen/X86/lvi-hardening-loads.ll
@@ -0,0 +1,144 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-ALL
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64
; RUN: llc -O0 -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64-NOOPT
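
; The X64 checks are shared by the first two runs. The X64-ALL checks cover
; the additional fences emitted only in the default run; the second run, with
; --x86-lvi-load-no-cbranch, omits them. X64-NOOPT covers the -O0 run.
;
; The IR below roughly corresponds to C source like the following (an
; approximate reconstruction, for reference only):
;
; int test(int **secret, int secret_size) {
;   int ret_val = 0;
;   _mm_lfence();
;   for (int i = 0; i < secret_size; ++i) {
;     if (i % 2 == 0)
;       ret_val = *secret[ret_val];
;   }
;   return ret_val;
; }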

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test(i32** %secret, i32 %secret_size) #0 {
; X64-LABEL: test:
entry:
%secret.addr = alloca i32**, align 8
%secret_size.addr = alloca i32, align 4
%ret_val = alloca i32, align 4
%i = alloca i32, align 4
store i32** %secret, i32*** %secret.addr, align 8
store i32 %secret_size, i32* %secret_size.addr, align 4
store i32 0, i32* %ret_val, align 4
call void @llvm.x86.sse2.lfence()
store i32 0, i32* %i, align 4
br label %for.cond

; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: lfence
; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: jmp .LBB0_1

; X64-NOOPT: # %bb.0: # %entry
; X64-NOOPT-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NOOPT-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)

for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%1 = load i32, i32* %secret_size.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %for.body, label %for.end

; X64: .LBB0_1: # %for.cond
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-ALL-NEXT: lfence
; X64-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; X64-ALL-NEXT: lfence
; X64-NEXT: jge .LBB0_5

; X64-NOOPT: .LBB0_1: # %for.cond
; X64-NOOPT-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: jge .LBB0_6

for.body: ; preds = %for.cond
%2 = load i32, i32* %i, align 4
%rem = srem i32 %2, 2
%cmp1 = icmp eq i32 %rem, 0
br i1 %cmp1, label %if.then, label %if.end

; X64: # %bb.2: # %for.body
; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-ALL-NEXT: lfence
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $31, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: andl $-2, %ecx
; X64-NEXT: cmpl %ecx, %eax
; X64-NEXT: jne .LBB0_4

; X64-NOOPT: # %bb.2: # %for.body
; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: cltd
; X64-NOOPT-NEXT: movl $2, %ecx
; X64-NOOPT-NEXT: idivl %ecx
; X64-NOOPT-NEXT: cmpl $0, %edx
; X64-NOOPT-NEXT: jne .LBB0_4

if.then: ; preds = %for.body
%3 = load i32**, i32*** %secret.addr, align 8
%4 = load i32, i32* %ret_val, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i32*, i32** %3, i64 %idxprom
%5 = load i32*, i32** %arrayidx, align 8
%6 = load i32, i32* %5, align 4
store i32 %6, i32* %ret_val, align 4
br label %if.end

; X64: # %bb.3: # %if.then
; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: lfence
; X64-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT: lfence
; X64-NEXT: movq (%rax,%rcx,8), %rax
; X64-NEXT: lfence
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: jmp .LBB0_4

; X64-NOOPT: # %bb.3: # %if.then
; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
; X64-NOOPT-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: movl (%rax), %edx
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: movl %edx, -{{[0-9]+}}(%rsp)

if.end: ; preds = %if.then, %for.body
br label %for.inc

for.inc: ; preds = %if.end
%7 = load i32, i32* %i, align 4
%inc = add nsw i32 %7, 1
store i32 %inc, i32* %i, align 4
br label %for.cond

; X64-NOOPT: .LBB0_5: # %for.inc
; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-NOOPT-NEXT: lfence
; X64-NOOPT-NEXT: addl $1, %eax
; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NOOPT-NEXT: jmp .LBB0_1

for.end: ; preds = %for.cond
%8 = load i32, i32* %ret_val, align 4
ret i32 %8
}

; Function Attrs: nounwind
declare void @llvm.x86.sse2.lfence() #1

attributes #0 = { "target-features"="+lvi-load-hardening" }
attributes #1 = { nounwind }
72 changes: 72 additions & 0 deletions llvm/test/CodeGen/X86/lvi-hardening-ret.ll
@@ -0,0 +1,72 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s
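
; With the lvi-cfi feature enabled, a function should not return with a bare
; retq; instead the return address is popped into a register, followed by an
; lfence and an indirect jmp through that register.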

define dso_local void @one_instruction() #0 {
; CHECK-LABEL: one_instruction:
entry:
ret void
; CHECK-NOT: retq
; CHECK: popq %[[x:[^ ]*]]
; CHECK-NEXT: lfence
; CHECK-NEXT: jmpq *%[[x]]
}

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @ordinary_function(i32 %x, i32 %y) #0 {
; CHECK-LABEL: ordinary_function:
entry:
%x.addr = alloca i32, align 4
%y.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
store i32 %y, i32* %y.addr, align 4
%0 = load i32, i32* %x.addr, align 4
%1 = load i32, i32* %y.addr, align 4
%add = add nsw i32 %0, %1
ret i32 %add
; CHECK-NOT: retq
; CHECK: popq %[[x:[^ ]*]]
; CHECK-NEXT: lfence
; CHECK-NEXT: jmpq *%[[x]]
}

; Function Attrs: noinline nounwind optnone uwtable
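; no_caller_saved_registers means no register may be clobbered to hold the
; return address, so the checks below instead expect the return address to be
; fenced in place and an ordinary retq to be kept.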
define dso_local i32 @no_caller_saved_registers_function(i32 %x, i32 %y) #1 {
; CHECK-LABEL: no_caller_saved_registers_function:
entry:
%x.addr = alloca i32, align 4
%y.addr = alloca i32, align 4
store i32 %x, i32* %x.addr, align 4
store i32 %y, i32* %y.addr, align 4
%0 = load i32, i32* %x.addr, align 4
%1 = load i32, i32* %y.addr, align 4
%add = add nsw i32 %0, %1
ret i32 %add
; CHECK-NOT: retq
; CHECK: shlq $0, (%{{[^ ]*}})
; CHECK-NEXT: lfence
; CHECK-NEXT: retq
}

; Function Attrs: noinline nounwind optnone uwtable
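; preserve_mostcc and preserve_allcc leave %r11 as the only general-purpose
; register a callee may clobber, so the pop+lfence+jmp sequence here is pinned
; to %r11.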
define dso_local preserve_mostcc void @preserve_most() #0 {
; CHECK-LABEL: preserve_most:
entry:
ret void
; CHECK-NOT: retq
; CHECK: popq %r11
; CHECK-NEXT: lfence
; CHECK-NEXT: jmpq *%r11
}

; Function Attrs: noinline nounwind optnone uwtable
define dso_local preserve_allcc void @preserve_all() #0 {
; CHECK-LABEL: preserve_all:
entry:
ret void
; CHECK-NOT: retq
; CHECK: popq %r11
; CHECK-NEXT: lfence
; CHECK-NEXT: jmpq *%r11
}

attributes #0 = { "target-features"="+lvi-cfi" }
attributes #1 = { "no_caller_saved_registers" "target-features"="+lvi-cfi" }