341 changes: 341 additions & 0 deletions llvm/lib/Transforms/Scalar/TLSVariableHoist.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,341 @@
//===- TLSVariableHoist.cpp -------- Remove Redundant TLS Loads ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass identifies and eliminates redundant TLS loads when the related
// option is set. For an example, please refer to the comment at the head of
// TLSVariableHoist.h.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/TLSVariableHoist.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace tlshoist;

#define DEBUG_TYPE "tlshoist"

// Command-line control for this pass. Only the value "optimize" is acted on
// in this file (see TLSVariableHoistPass::runImpl); every other value leaves
// the decision to the per-function "tls-load-hoist" attribute.
//
// TODO: Support "strict" model if we need to strictly load TLS address,
// because "non-optimize" may also do some optimization in other passes.
static cl::opt<std::string> TLSLoadHoist(
    "tls-load-hoist",
    cl::desc(
        // Each literal ends with a space so the sentences do not run together
        // once the compiler concatenates them into a single help string.
        "hoist the TLS loads in PIC model: "
        "tls-load-hoist=optimize: Eliminate redundant TLS load(s). "
        "tls-load-hoist=strict: Strictly load TLS address before every use. "
        "tls-load-hoist=non-optimize: Generally load TLS before use(s)."),
    cl::init("non-optimize"), cl::Hidden);

namespace {

/// Legacy pass-manager wrapper around TLSVariableHoistPass. It only gathers
/// the required analyses and forwards the work to the shared implementation.
class TLSVariableHoistLegacyPass : public FunctionPass {
  /// The implementation object that performs the actual transformation.
  TLSVariableHoistPass Impl;

public:
  /// Pass identification, replacement for typeid.
  static char ID;

  TLSVariableHoistLegacyPass() : FunctionPass(ID) {
    initializeTLSVariableHoistLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  /// Human-readable name of the pass.
  StringRef getPassName() const override { return "TLS Variable Hoist"; }

  /// The pass needs the dominator tree and loop info, and it declares that
  /// it preserves the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.setPreservesCFG();
  }

  /// Run the hoisting on \p Fn; returns true if the function was modified.
  bool runOnFunction(Function &Fn) override;
};

} // end anonymous namespace

// Out-of-class definition of the static pass identifier declared in
// TLSVariableHoistLegacyPass.
char TLSVariableHoistLegacyPass::ID = 0;

// Register the legacy pass under the name "tlshoist" and list the analysis
// passes it depends on. The dependencies mirror the analyses requested in
// TLSVariableHoistLegacyPass::getAnalysisUsage.
INITIALIZE_PASS_BEGIN(TLSVariableHoistLegacyPass, "tlshoist",
"TLS Variable Hoist", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(TLSVariableHoistLegacyPass, "tlshoist",
"TLS Variable Hoist", false, false)

/// Factory for the legacy pass: returns a newly allocated
/// TLSVariableHoistLegacyPass as a FunctionPass pointer.
FunctionPass *llvm::createTLSVariableHoistPass() {
  auto *LegacyPass = new TLSVariableHoistLegacyPass();
  return LegacyPass;
}

/// Legacy pass-manager entry point. Fetches the dominator tree and loop info
/// for \p Fn, hands them to the shared implementation, and reports whether the
/// function was changed. Functions rejected by skipFunction() are left alone.
bool TLSVariableHoistLegacyPass::runOnFunction(Function &Fn) {
  if (skipFunction(Fn))
    return false;

  LLVM_DEBUG(dbgs() << "********** Begin TLS Variable Hoist **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  DominatorTree &DomTree =
      getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo &Loops = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  const bool Changed = Impl.runImpl(Fn, DomTree, Loops);

  // Dump the rewritten function only when something actually changed.
  if (Changed)
    LLVM_DEBUG(dbgs() << "********** Function after TLS Variable Hoist: "
                      << Fn.getName() << '\n'
                      << Fn);

  LLVM_DEBUG(dbgs() << "********** End TLS Variable Hoist **********\n");
  return Changed;
}

/// Record every operand of \p Inst that directly names a thread-local global
/// variable. Each such operand becomes one user entry (instruction plus
/// operand index) of that variable's candidate in TLSCandMap.
void TLSVariableHoistPass::collectTLSCandidate(Instruction *Inst) {
  // Cast instructions are never collected; that includes the no-op bitcasts
  // this pass itself creates in genBitCastInst.
  if (Inst->isCast())
    return;

  for (Use &Opnd : Inst->operands()) {
    auto *TLSVar = dyn_cast<GlobalVariable>(Opnd.get());
    if (TLSVar && TLSVar->isThreadLocal())
      TLSCandMap[TLSVar].addUser(Inst, Opnd.getOperandNo());
  }
}

/// Rebuild TLSCandMap for \p Fn: one candidate per thread-local global
/// variable, listing every (instruction, operand index) in a reachable block
/// that uses it directly.
void TLSVariableHoistPass::collectTLSCandidates(Function &Fn) {
  // Always start from an empty map. The pass object is reused, so clearing
  // only after the early return below would leave candidates (and raw
  // instruction pointers) from a previously processed function in place for
  // tryReplaceTLSCandidates to act on.
  TLSCandMap.clear();

  // Quick module-level check: without any thread-local global there is
  // nothing to collect.
  Module *M = Fn.getParent();
  bool HasTLS = llvm::any_of(
      M->globals(), [](GlobalVariable &GV) { return GV.isThreadLocal(); });
  if (!HasTLS)
    return;

  // Collect the TLS users block by block.
  for (BasicBlock &BB : Fn) {
    // Ignore unreachable basic blocks.
    if (!DT->isReachableFromEntry(&BB))
      continue;

    for (Instruction &Inst : BB)
      collectTLSCandidate(&Inst);
  }
}

/// Return true when \p Cand has exactly one user and that user's block is not
/// inside any loop known to \p LI. A null \p LI is treated as "no loops".
static bool OneUseOutsideLoop(tlshoist::TLSCandidate &Cand, LoopInfo *LI) {
  // The size test must come first: Users[0] is only valid when it passes.
  return Cand.Users.size() == 1 &&
         !(LI && LI->getLoopFor(Cand.Users[0].Inst->getParent()));
}

/// Pick an insertion point inside the entry block of \p Fn: the first entry
/// instruction that is itself a user of \p Cand, otherwise the entry block's
/// terminator, otherwise (no terminator) the end of the block.
BasicBlock::iterator
TLSVariableHoistPass::findInsertPosInEntry(Function &Fn,
                                           tlshoist::TLSCandidate &Cand) {
  BasicBlock &EntryBB = Fn.getEntryBlock();

  // True if the given instruction is recorded as a user of the candidate.
  auto IsCandUser = [&Cand](const Instruction &I) {
    for (const tlshoist::TLSUser &U : Cand.Users)
      if (U.Inst == &I)
        return true;
    return false;
  };

  // The entry block is usually small, so a linear scan is cheap.
  for (BasicBlock::iterator It = EntryBB.begin(), End = EntryBB.end();
       It != End; ++It)
    if (IsCandUser(*It))
      return It;

  if (Instruction *Term = EntryBB.getTerminator())
    return Term->getIterator();

  // No terminator: the entry block is empty.
  return EntryBB.end();
}

/// For a block \p BB that lives inside a loop, return an instruction outside
/// the outermost enclosing loop that can serve as a hoisting point: the
/// terminator of the loop's unique outside predecessor if there is one,
/// otherwise the terminator of the nearest common dominator of the loop
/// header and all of its predecessors.
Instruction *TLSVariableHoistPass::getNearestLoopDomInst(BasicBlock *BB) {
  Loop *Outermost = LI->getLoopFor(BB);
  assert(Outermost && "Unexcepted Loop status!");

  // Climb to the top of the loop nest.
  while (Outermost->getParentLoop())
    Outermost = Outermost->getParentLoop();

  // A unique predecessor outside the loop is the preferred location. Its
  // terminator may be nullptr if that block is empty; the caller handles it.
  if (BasicBlock *OutsidePred = Outermost->getLoopPredecessor())
    return OutsidePred->getTerminator();

  // Otherwise fold the header and every predecessor into one dominator.
  BasicBlock *LoopHeader = Outermost->getHeader();
  BasicBlock *CommonDom = LoopHeader;
  for (BasicBlock *Pred : predecessors(LoopHeader))
    CommonDom = DT->findNearestCommonDominator(CommonDom, Pred);
  assert(CommonDom && "Not find dominator BB!");

  Instruction *DomTerm = CommonDom->getTerminator();
  assert(DomTerm && "Not find terminator instruction!");
  return DomTerm;
}

/// Combine two candidate insertion points into one that dominates both.
///
/// \param I1 the position accumulated so far; may be nullptr on the first
///           call, in which case \p I2 is returned unchanged.
/// \param I2 the position required by the next user.
/// \returns \p I1 or \p I2 if one dominates the other (or they are the same
///          instruction); otherwise the terminator of the nearest common
///          dominator block of their parents.
Instruction *TLSVariableHoistPass::getDomInst(Instruction *I1,
                                              Instruction *I2) {
  if (!I1)
    return I2;

  // Identical positions need no merging. This happens when one instruction
  // names the TLS variable in several operands (collectTLSCandidate records
  // one user per operand) or when several loop users map to the same
  // instruction. DominatorTree::dominates() does not consider an instruction
  // to dominate itself, so without this guard we would fall through to the
  // block terminator, which comes after a non-terminator user; the hoisted
  // value would then be defined after its own use.
  if (I1 == I2)
    return I1;

  if (DT->dominates(I1, I2))
    return I1;
  if (DT->dominates(I2, I1))
    return I2;

  // If there is no dominance relation, use the common dominator block and
  // insert in front of its terminator.
  BasicBlock *DomBB =
      DT->findNearestCommonDominator(I1->getParent(), I2->getParent());
  assert(DomBB && "Common dominator block not found!");

  Instruction *Dom = DomBB->getTerminator();
  assert(Dom && "Common dominator not found!");

  return Dom;
}

/// Compute where the replacement value for \p GV should be inserted.
///
/// Users outside loops contribute their own position; users inside loops
/// contribute a position in front of the outermost enclosing loop. All
/// contributions are merged with getDomInst. On the normal path \p PosBB is
/// set to the block of the chosen instruction. On the entry-block fallback
/// paths \p PosBB is left untouched, so the caller must pre-set it to the
/// entry block.
BasicBlock::iterator TLSVariableHoistPass::findInsertPos(Function &Fn,
                                                         GlobalVariable *GV,
                                                         BasicBlock *&PosBB) {
  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];
  if (!DT)
    return findInsertPosInEntry(Fn, Cand);

  Instruction *InsertBefore = nullptr;
  for (auto &User : Cand.Users) {
    Instruction *Anchor = User.Inst;
    BasicBlock *UserBB = Anchor->getParent();

    const bool InLoop = LI && LI->getLoopFor(UserBB);
    if (InLoop) {
      // Hoist out of the whole loop nest.
      Anchor = getNearestLoopDomInst(UserBB);
      // A missing anchor means the block in front of the loop is empty. That
      // is rare, so simply fall back to the entry block.
      if (!Anchor)
        return findInsertPosInEntry(Fn, Cand);
    }

    InsertBefore = getDomInst(InsertBefore, Anchor);
  }

  assert(InsertBefore && "Unexpected insert position!");
  PosBB = InsertBefore->getParent();
  return InsertBefore->getIterator();
}

/// Create a bitcast of \p GV to its own type (a no-op cast) named
/// "tls_bitcast", insert it at the position chosen by findInsertPos, and
/// return it. The result is what the TLS users are rewritten to reference.
Instruction *TLSVariableHoistPass::genBitCastInst(Function &Fn,
                                                  GlobalVariable *GV) {
  // findInsertPos only overwrites the block on its normal path, so default it
  // to the entry block for the fallback paths.
  BasicBlock *InsertBB = &Fn.getEntryBlock();
  BasicBlock::iterator InsertPt = findInsertPos(Fn, GV, InsertBB);

  Instruction *NoOpCast = new BitCastInst(GV, GV->getType(), "tls_bitcast");
  InsertBB->getInstList().insert(InsertPt, NoOpCast);
  return NoOpCast;
}

/// Rewrite all recorded users of \p GV to go through one hoisted bitcast.
/// Returns false, leaving the IR untouched, when the variable has a single
/// user that is not inside a loop; returns true otherwise.
bool TLSVariableHoistPass::tryReplaceTLSCandidate(Function &Fn,
                                                  GlobalVariable *GV) {
  tlshoist::TLSCandidate &Cand = TLSCandMap[GV];

  // A lone use outside any loop gains nothing from hoisting.
  if (OneUseOutsideLoop(Cand, LI))
    return false;

  // Materialize the no-op bitcast once ...
  Instruction *Replacement = genBitCastInst(Fn, GV);

  // ... and point every recorded operand at it.
  for (tlshoist::TLSUser &U : Cand.Users)
    U.Inst->setOperand(U.OpndIdx, Replacement);

  return true;
}

/// Try the replacement for every collected TLS variable. Returns true if at
/// least one variable was rewritten; an empty candidate map yields false.
bool TLSVariableHoistPass::tryReplaceTLSCandidates(Function &Fn) {
  bool Changed = false;

  // Every candidate is attempted; an earlier success must not stop the scan.
  for (auto &Entry : TLSCandMap)
    if (tryReplaceTLSCandidate(Fn, Entry.first))
      Changed = true;

  return Changed;
}

/// Shared implementation used by both pass-manager front ends.
///
/// The transformation runs only on functions that are not optnone and for
/// which hoisting was requested, either globally through
/// -tls-load-hoist=optimize or per function through the "tls-load-hoist"
/// attribute. Returns true if \p Fn was modified.
bool TLSVariableHoistPass::runImpl(Function &Fn, DominatorTree &DT,
                                   LoopInfo &LI) {
  if (Fn.hasOptNone())
    return false;

  const bool Requested = TLSLoadHoist == "optimize" ||
                         Fn.getAttributes().hasFnAttr("tls-load-hoist");
  if (!Requested)
    return false;

  // Stash the analyses for the helper methods.
  this->DT = &DT;
  this->LI = &LI;

  // Gather the TLS users, then rewrite them.
  collectTLSCandidates(Fn);
  return tryReplaceTLSCandidates(Fn);
}

/// New pass-manager entry point. Obtains loop info and the dominator tree for
/// \p F and runs the shared implementation. If the function was changed, only
/// the CFG analyses are reported as preserved; otherwise everything is.
PreservedAnalyses TLSVariableHoistPass::run(Function &F,
                                            FunctionAnalysisManager &AM) {
  LoopInfo &Loops = AM.getResult<LoopAnalysis>(F);
  DominatorTree &DomTree = AM.getResult<DominatorTreeAnalysis>(F);

  if (runImpl(F, DomTree, Loops)) {
    // Instructions were inserted and operands rewritten, but no block or
    // edge was touched.
    PreservedAnalyses Preserved;
    Preserved.preserveSet<CFGAnalyses>();
    return Preserved;
  }

  return PreservedAnalyses::all();
}
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AArch64/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: Stack Safety Analysis
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
Expand Down
7 changes: 7 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@
; GCN-O1-NEXT: Expand vector predication intrinsics
; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-NEXT: Expand reduction intrinsics
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: TLS Variable Hoist
; GCN-O1-NEXT: AMDGPU Attributor
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
Expand Down Expand Up @@ -484,6 +486,8 @@
; GCN-O1-OPTS-NEXT: Expand vector predication intrinsics
; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-OPTS-NEXT: Expand reduction intrinsics
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: TLS Variable Hoist
; GCN-O1-OPTS-NEXT: Early CSE
; GCN-O1-OPTS-NEXT: AMDGPU Attributor
; GCN-O1-OPTS-NEXT: CallGraph Construction
Expand Down Expand Up @@ -769,6 +773,8 @@
; GCN-O2-NEXT: Expand vector predication intrinsics
; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O2-NEXT: Expand reduction intrinsics
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: TLS Variable Hoist
; GCN-O2-NEXT: Early CSE
; GCN-O2-NEXT: AMDGPU Attributor
; GCN-O2-NEXT: CallGraph Construction
Expand Down Expand Up @@ -1062,6 +1068,7 @@
; GCN-O3-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O3-NEXT: Expand reduction intrinsics
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: TLS Variable Hoist
; GCN-O3-NEXT: Phi Values Analysis
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/PowerPC/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/X86/opt-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
; CHECK-NEXT: Expand vector predication intrinsics
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: TLS Variable Hoist
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: X86 Partial Reduction
; CHECK-NEXT: Expand indirectbr instructions
Expand Down
248 changes: 248 additions & 0 deletions llvm/test/CodeGen/X86/tls-loads-control.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize --stop-after=tlshoist -o - %s | FileCheck %s
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --stop-after=tlshoist -o - %s | FileCheck %s

; This test comes from compiling clang/test/CodeGen/intel/tls_loads.cpp with:
; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm)

; // Variable declaration and definition:
; thread_local int thl_x;
; thread_local int thl_x2;
;
; struct SS {
; char thl_c;
; int num;
; };
;
; int gfunc();
; int gfunc2(int);

; // First function (@_Z2f1i):
; int f1(int c) {
; while (c)
; c++;
;
; int *px = &thl_x;
; c -= gfunc();
;
; while(c++) {
; c = gfunc();
; while (c--)
; *px += gfunc2(thl_x2);
; }
; return *px;
; }

$_ZTW5thl_x = comdat any

$_ZTW6thl_x2 = comdat any

@thl_x = thread_local global i32 0, align 4
@thl_x2 = thread_local global i32 0, align 4
@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4
@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4

; Function Attrs: mustprogress uwtable
define noundef i32 @_Z2f1i(i32 noundef %c) local_unnamed_addr #0 {
; CHECK-LABEL: _Z2f1i
; CHECK: entry:
; CHECK-NEXT: %call = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %phi.cmp = icmp eq i32 %call, 0
; CHECK-NEXT: %tls_bitcast1 = bitcast i32* @thl_x to i32*
; CHECK-NEXT: br i1 %phi.cmp, label %while.end11, label %while.body4.preheader

; CHECK: while.body4.preheader:
; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x2 to i32*
; CHECK-NEXT: br label %while.body4

; CHECK: while.body4:
; CHECK-NEXT: %call5 = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %tobool7.not18 = icmp eq i32 %call5, 0
; CHECK-NEXT: br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8.preheader

; CHECK: while.body8.preheader:
; CHECK-NEXT: br label %while.body8

; CHECK: while.body4.backedge.loopexit:
; CHECK-NEXT: br label %while.body4.backedge

; CHECK: while.body4.backedge:
; CHECK-NEXT: br label %while.body4, !llvm.loop !4

; CHECK: while.body8:
; CHECK-NEXT: %c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body8.preheader ]
; CHECK-NEXT: %dec = add i32 %c.addr.219, -1
; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4
; CHECK-NEXT: %call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast1, align 4
; CHECK-NEXT: %add = add nsw i32 %1, %call9
; CHECK-NEXT: store i32 %add, i32* %tls_bitcast1, align 4
; CHECK-NEXT: %tobool7.not = icmp eq i32 %dec, 0
; CHECK-NEXT: br i1 %tobool7.not, label %while.body4.backedge.loopexit, label %while.body8, !llvm.loop !4

; CHECK: while.end11:
; CHECK-NEXT: %2 = load i32, i32* %tls_bitcast1, align 4
; CHECK-NEXT: ret i32 %2

entry:
%call = tail call noundef i32 @_Z5gfuncv()
%phi.cmp = icmp eq i32 %call, 0
br i1 %phi.cmp, label %while.end11, label %while.body4

while.body4: ; preds = %entry, %while.body4.backedge
%call5 = tail call noundef i32 @_Z5gfuncv()
%tobool7.not18 = icmp eq i32 %call5, 0
br i1 %tobool7.not18, label %while.body4.backedge, label %while.body8

while.body4.backedge: ; preds = %while.body8, %while.body4
br label %while.body4, !llvm.loop !4

while.body8: ; preds = %while.body4, %while.body8
%c.addr.219 = phi i32 [ %dec, %while.body8 ], [ %call5, %while.body4 ]
%dec = add nsw i32 %c.addr.219, -1
%0 = load i32, i32* @thl_x2, align 4
%call9 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
%1 = load i32, i32* @thl_x, align 4
%add = add nsw i32 %1, %call9
store i32 %add, i32* @thl_x, align 4
%tobool7.not = icmp eq i32 %dec, 0
br i1 %tobool7.not, label %while.body4.backedge, label %while.body8, !llvm.loop !4

while.end11: ; preds = %entry
%2 = load i32, i32* @thl_x, align 4
ret i32 %2
}

; // Second function (@_Z2f2i):
; int f2(int c) {
; thread_local struct SS st;
; c += gfunc();
; while (c--) {
; thl_x += gfunc();
; st.thl_c += (char)gfunc();
; st.num += gfunc();
; }
; return thl_x;
; }
declare noundef i32 @_Z5gfuncv() local_unnamed_addr #1

declare noundef i32 @_Z6gfunc2i(i32 noundef) local_unnamed_addr #1

; Function Attrs: mustprogress uwtable
define noundef i32 @_Z2f2i(i32 noundef %c) local_unnamed_addr #0 {
; CHECK-LABEL: _Z2f2i
; CHECK: entry:
; CHECK-NEXT: %call = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %add = add nsw i32 %call, %c
; CHECK-NEXT: %tobool.not12 = icmp eq i32 %add, 0
; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32*
; CHECK-NEXT: br i1 %tobool.not12, label %while.end, label %while.body.preheader

; CHECK: while.body.preheader:
; CHECK-NEXT: %tls_bitcast1 = bitcast i8* @_ZZ2f2iE2st.0 to i8*
; CHECK-NEXT: %tls_bitcast2 = bitcast i32* @_ZZ2f2iE2st.1 to i32*
; CHECK-NEXT: br label %while.body

; CHECK: while.body:
; CHECK-NEXT: %c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %while.body.preheader ]
; CHECK-NEXT: %dec = add i32 %c.addr.013, -1
; CHECK-NEXT: %call1 = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4
; CHECK-NEXT: %add2 = add nsw i32 %0, %call1
; CHECK-NEXT: store i32 %add2, i32* %tls_bitcast, align 4
; CHECK-NEXT: %call3 = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %1 = load i8, i8* %tls_bitcast1, align 4
; CHECK-NEXT: %2 = trunc i32 %call3 to i8
; CHECK-NEXT: %conv7 = add i8 %1, %2
; CHECK-NEXT: store i8 %conv7, i8* %tls_bitcast1, align 4
; CHECK-NEXT: %call8 = tail call noundef i32 @_Z5gfuncv()
; CHECK-NEXT: %3 = load i32, i32* %tls_bitcast2, align 4
; CHECK-NEXT: %add9 = add nsw i32 %3, %call8
; CHECK-NEXT: store i32 %add9, i32* %tls_bitcast2, align 4
; CHECK-NEXT: %tobool.not = icmp eq i32 %dec, 0
; CHECK-NEXT: br i1 %tobool.not, label %while.end.loopexit, label %while.body

; CHECK: while.end.loopexit:
; CHECK-NEXT: br label %while.end

; CHECK: while.end:
; CHECK-NEXT: %4 = load i32, i32* %tls_bitcast, align 4
; CHECK-NEXT: ret i32 %4
entry:
%call = tail call noundef i32 @_Z5gfuncv()
%add = add nsw i32 %call, %c
%tobool.not12 = icmp eq i32 %add, 0
br i1 %tobool.not12, label %while.end, label %while.body

while.body: ; preds = %entry, %while.body
%c.addr.013 = phi i32 [ %dec, %while.body ], [ %add, %entry ]
%dec = add nsw i32 %c.addr.013, -1
%call1 = tail call noundef i32 @_Z5gfuncv()
%0 = load i32, i32* @thl_x, align 4
%add2 = add nsw i32 %0, %call1
store i32 %add2, i32* @thl_x, align 4
%call3 = tail call noundef i32 @_Z5gfuncv()
%1 = load i8, i8* @_ZZ2f2iE2st.0, align 4
%2 = trunc i32 %call3 to i8
%conv7 = add i8 %1, %2
store i8 %conv7, i8* @_ZZ2f2iE2st.0, align 4
%call8 = tail call noundef i32 @_Z5gfuncv()
%3 = load i32, i32* @_ZZ2f2iE2st.1, align 4
%add9 = add nsw i32 %3, %call8
store i32 %add9, i32* @_ZZ2f2iE2st.1, align 4
%tobool.not = icmp eq i32 %dec, 0
br i1 %tobool.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %entry
%4 = load i32, i32* @thl_x, align 4
ret i32 %4
}

; // Third function (@_Z2f3i):
; int f3(int c) {
; int *px = &thl_x;
; gfunc2(*px);
; gfunc2(*px);
; return 1;
; }

; Function Attrs: mustprogress uwtable
define noundef i32 @_Z2f3i(i32 noundef %c) local_unnamed_addr #0 {
; CHECK-LABEL: _Z2f3i
; CHECK: entry:
; CHECK-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32*
; CHECK-NEXT: %0 = load i32, i32* %tls_bitcast, align 4
; CHECK-NEXT: %call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
; CHECK-NEXT: %1 = load i32, i32* %tls_bitcast, align 4
; CHECK-NEXT: %call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1)
; CHECK-NEXT: ret i32 1
entry:
%0 = load i32, i32* @thl_x, align 4
%call = tail call noundef i32 @_Z6gfunc2i(i32 noundef %0)
%1 = load i32, i32* @thl_x, align 4
%call1 = tail call noundef i32 @_Z6gfunc2i(i32 noundef %1)
ret i32 1
}

; Function Attrs: uwtable
define weak_odr hidden noundef i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat {
ret i32* @thl_x
}

; Function Attrs: uwtable
define weak_odr hidden noundef i32* @_ZTW6thl_x2() local_unnamed_addr #2 comdat {
ret i32* @thl_x2
}

attributes #0 = { mustprogress uwtable "tls-load-hoist" "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"uwtable", i32 2}
!3 = !{!"clang version 15.0.0"}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.mustprogress"}
51 changes: 51 additions & 0 deletions llvm/test/CodeGen/X86/tls-loads-control2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0
; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2
; RUN: opt -S -mtriple=x86_64-unknown-unknown -tlshoist --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2

$_ZTW5thl_x = comdat any

@thl_x = thread_local global i32 0, align 4

; Function Attrs: mustprogress uwtable
define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 {
entry:
%0 = load i32, i32* @thl_x, align 4
%call = tail call i32 @_Z5gfunci(i32 %0)
%1 = load i32, i32* @thl_x, align 4
%call1 = tail call i32 @_Z5gfunci(i32 %1)
ret i32 1
}

;HOIST0-LABEL: _Z2f1i
;HOIST0: entry:
;HOIST0-NEXT: %tls_bitcast = bitcast i32* @thl_x to i32*
;HOIST0-NEXT: %0 = load i32, i32* %tls_bitcast, align 4
;HOIST0-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0)
;HOIST0-NEXT: %1 = load i32, i32* %tls_bitcast, align 4
;HOIST0-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1)
;HOIST0-NEXT: ret i32 1

;HOIST2-LABEL: _Z2f1i
;HOIST2: entry:
;HOIST2-NEXT: %0 = load i32, i32* @thl_x, align 4
;HOIST2-NEXT: %call = tail call i32 @_Z5gfunci(i32 %0)
;HOIST2-NEXT: %1 = load i32, i32* @thl_x, align 4
;HOIST2-NEXT: %call1 = tail call i32 @_Z5gfunci(i32 %1)
;HOIST2-NEXT: ret i32 1

declare i32 @_Z5gfunci(i32) local_unnamed_addr #1

; Function Attrs: uwtable
define weak_odr hidden i32* @_ZTW5thl_x() local_unnamed_addr #2 comdat {
ret i32* @thl_x
}

attributes #0 = { mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"uwtable", i32 1}
358 changes: 358 additions & 0 deletions llvm/test/CodeGen/X86/tls-loads-control3.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=optimize -o - %s | FileCheck %s --check-prefix=HOIST0
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic --tls-load-hoist=non-optimize -o - %s | FileCheck %s --check-prefix=HOIST2
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 --relocation-model=pic -o - %s | FileCheck %s --check-prefix=HOIST2

; This test has no module flag {"tls-load-hoist", i32 0}, so use --tls-load-hoist=x
; to choose the way of loading thread_local address.

; This test comes from compiling clang/test/CodeGen/intel/tls_loads.cpp with:
; (clang tls_loads.cpp -fPIC -ftls-model=global-dynamic -O2 -S -emit-llvm)

$_ZTW5thl_x = comdat any

$_ZTW6thl_x2 = comdat any

@thl_x = thread_local global i32 0, align 4
@thl_x2 = thread_local global i32 0, align 4
@_ZZ2f2iE2st.0 = internal thread_local unnamed_addr global i8 0, align 4
@_ZZ2f2iE2st.1 = internal thread_local unnamed_addr global i32 0, align 4

; For HOIST0, check call __tls_get_addr@PLT only one time for each thread_local variable.
; For HOIST2, Check the default way: usually call __tls_get_addr@PLT every time when use thread_local variable.

; Function Attrs: mustprogress uwtable
define i32 @_Z2f1i(i32 %c) local_unnamed_addr #0 {
; HOIST0-LABEL: _Z2f1i:
; HOIST0: # %bb.0: # %entry
; HOIST0-NEXT: pushq %r15
; HOIST0-NEXT: .cfi_def_cfa_offset 16
; HOIST0-NEXT: pushq %r14
; HOIST0-NEXT: .cfi_def_cfa_offset 24
; HOIST0-NEXT: pushq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 32
; HOIST0-NEXT: .cfi_offset %rbx, -32
; HOIST0-NEXT: .cfi_offset %r14, -24
; HOIST0-NEXT: .cfi_offset %r15, -16
; HOIST0-NEXT: movl %edi, %ebx
; HOIST0-NEXT: data16
; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST0-NEXT: data16
; HOIST0-NEXT: data16
; HOIST0-NEXT: rex64
; HOIST0-NEXT: callq __tls_get_addr@PLT
; HOIST0-NEXT: movq %rax, %r14
; HOIST0-NEXT: testl %ebx, %ebx
; HOIST0-NEXT: je .LBB0_4
; HOIST0-NEXT: # %bb.1: # %while.body.preheader
; HOIST0-NEXT: data16
; HOIST0-NEXT: leaq thl_x2@TLSGD(%rip), %rdi
; HOIST0-NEXT: data16
; HOIST0-NEXT: data16
; HOIST0-NEXT: rex64
; HOIST0-NEXT: callq __tls_get_addr@PLT
; HOIST0-NEXT: movq %rax, %r15
; HOIST0-NEXT: .p2align 4, 0x90
; HOIST0-NEXT: .LBB0_2: # %while.body
; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1
; HOIST0-NEXT: movl (%r15), %edi
; HOIST0-NEXT: callq _Z6gfunc2i@PLT
; HOIST0-NEXT: addl (%r14), %eax
; HOIST0-NEXT: movl %eax, (%r14)
; HOIST0-NEXT: decl %ebx
; HOIST0-NEXT: jne .LBB0_2
; HOIST0-NEXT: jmp .LBB0_3
; HOIST0-NEXT: .LBB0_4: # %entry.while.end_crit_edge
; HOIST0-NEXT: movl (%r14), %eax
; HOIST0-NEXT: .LBB0_3: # %while.end
; HOIST0-NEXT: popq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 24
; HOIST0-NEXT: popq %r14
; HOIST0-NEXT: .cfi_def_cfa_offset 16
; HOIST0-NEXT: popq %r15
; HOIST0-NEXT: .cfi_def_cfa_offset 8
; HOIST0-NEXT: retq
;
; HOIST2-LABEL: _Z2f1i:
; HOIST2: # %bb.0: # %entry
; HOIST2-NEXT: pushq %rbp
; HOIST2-NEXT: .cfi_def_cfa_offset 16
; HOIST2-NEXT: pushq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 24
; HOIST2-NEXT: pushq %rax
; HOIST2-NEXT: .cfi_def_cfa_offset 32
; HOIST2-NEXT: .cfi_offset %rbx, -24
; HOIST2-NEXT: .cfi_offset %rbp, -16
; HOIST2-NEXT: testl %edi, %edi
; HOIST2-NEXT: je .LBB0_4
; HOIST2-NEXT: # %bb.1:
; HOIST2-NEXT: movl %edi, %ebx
; HOIST2-NEXT: .p2align 4, 0x90
; HOIST2-NEXT: .LBB0_2: # %while.body
; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x2@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: movl (%rax), %edi
; HOIST2-NEXT: callq _Z6gfunc2i@PLT
; HOIST2-NEXT: movl %eax, %ebp
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: addl (%rax), %ebp
; HOIST2-NEXT: movl %ebp, (%rax)
; HOIST2-NEXT: decl %ebx
; HOIST2-NEXT: jne .LBB0_2
; HOIST2-NEXT: jmp .LBB0_3
; HOIST2-NEXT: .LBB0_4: # %entry.while.end_crit_edge
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: movl (%rax), %ebp
; HOIST2-NEXT: .LBB0_3: # %while.end
; HOIST2-NEXT: movl %ebp, %eax
; HOIST2-NEXT: addq $8, %rsp
; HOIST2-NEXT: .cfi_def_cfa_offset 24
; HOIST2-NEXT: popq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 16
; HOIST2-NEXT: popq %rbp
; HOIST2-NEXT: .cfi_def_cfa_offset 8
; HOIST2-NEXT: retq
entry:
%tobool.not3 = icmp eq i32 %c, 0
br i1 %tobool.not3, label %entry.while.end_crit_edge, label %while.body

entry.while.end_crit_edge: ; preds = %entry
%.pre = load i32, i32* @thl_x, align 4
br label %while.end

while.body: ; preds = %entry, %while.body
%c.addr.04 = phi i32 [ %dec, %while.body ], [ %c, %entry ]
%dec = add nsw i32 %c.addr.04, -1
%0 = load i32, i32* @thl_x2, align 4
%call = tail call i32 @_Z6gfunc2i(i32 %0)
%1 = load i32, i32* @thl_x, align 4
%add = add nsw i32 %1, %call
store i32 %add, i32* @thl_x, align 4
%tobool.not = icmp eq i32 %dec, 0
br i1 %tobool.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %entry.while.end_crit_edge
%2 = phi i32 [ %.pre, %entry.while.end_crit_edge ], [ %add, %while.body ]
ret i32 %2
}

declare i32 @_Z6gfunc2i(i32) local_unnamed_addr #1

; f2(int) per Itanium mangling. Runs a countdown loop of %c iterations; each
; iteration calls @_Z5gfuncv three times and adds the results to @thl_x,
; @_ZZ2f2iE2st.0 (low 8 bits only) and @_ZZ2f2iE2st.1. After the loop the
; function returns the current value of @thl_x.
;
; The expected assembly reaches @thl_x through @TLSGD and the two st.* globals
; through @TLSLD/@DTPOFF, so every access needs an address obtained from
; __tls_get_addr. The two check prefixes differ in where those calls sit:
;   * Under the HOIST0 prefix both __tls_get_addr calls happen before the loop
;     (entry block and loop preheader); the loop body and %while.end reuse the
;     addresses kept in %r14/%r15/%r12.
;   * Under the HOIST2 prefix __tls_get_addr is called twice inside every loop
;     iteration (once for @thl_x, once for the st.* base) and once more in
;     %while.end.
; NOTE(review): which -tls-load-hoist setting each prefix corresponds to is
; decided by the RUN lines at the top of the file; confirm there.
; Function Attrs: mustprogress uwtable
define i32 @_Z2f2i(i32 %c) local_unnamed_addr #0 {
; HOIST0-LABEL: _Z2f2i:
; HOIST0: # %bb.0: # %entry
; HOIST0-NEXT: pushq %r15
; HOIST0-NEXT: .cfi_def_cfa_offset 16
; HOIST0-NEXT: pushq %r14
; HOIST0-NEXT: .cfi_def_cfa_offset 24
; HOIST0-NEXT: pushq %r12
; HOIST0-NEXT: .cfi_def_cfa_offset 32
; HOIST0-NEXT: pushq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 40
; HOIST0-NEXT: pushq %rax
; HOIST0-NEXT: .cfi_def_cfa_offset 48
; HOIST0-NEXT: .cfi_offset %rbx, -40
; HOIST0-NEXT: .cfi_offset %r12, -32
; HOIST0-NEXT: .cfi_offset %r14, -24
; HOIST0-NEXT: .cfi_offset %r15, -16
; HOIST0-NEXT: movl %edi, %ebx
; HOIST0-NEXT: data16
; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST0-NEXT: data16
; HOIST0-NEXT: data16
; HOIST0-NEXT: rex64
; HOIST0-NEXT: callq __tls_get_addr@PLT
; HOIST0-NEXT: movq %rax, %r14
; HOIST0-NEXT: testl %ebx, %ebx
; HOIST0-NEXT: je .LBB1_3
; HOIST0-NEXT: # %bb.1: # %while.body.preheader
; HOIST0-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi
; HOIST0-NEXT: callq __tls_get_addr@PLT
; HOIST0-NEXT: movq %rax, %rcx
; HOIST0-NEXT: leaq _ZZ2f2iE2st.0@DTPOFF(%rax), %r15
; HOIST0-NEXT: leaq _ZZ2f2iE2st.1@DTPOFF(%rax), %r12
; HOIST0-NEXT: .p2align 4, 0x90
; HOIST0-NEXT: .LBB1_2: # %while.body
; HOIST0-NEXT: # =>This Inner Loop Header: Depth=1
; HOIST0-NEXT: callq _Z5gfuncv@PLT
; HOIST0-NEXT: addl %eax, (%r14)
; HOIST0-NEXT: callq _Z5gfuncv@PLT
; HOIST0-NEXT: addb %al, (%r15)
; HOIST0-NEXT: callq _Z5gfuncv@PLT
; HOIST0-NEXT: addl %eax, (%r12)
; HOIST0-NEXT: decl %ebx
; HOIST0-NEXT: jne .LBB1_2
; HOIST0-NEXT: .LBB1_3: # %while.end
; HOIST0-NEXT: movl (%r14), %eax
; HOIST0-NEXT: addq $8, %rsp
; HOIST0-NEXT: .cfi_def_cfa_offset 40
; HOIST0-NEXT: popq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 32
; HOIST0-NEXT: popq %r12
; HOIST0-NEXT: .cfi_def_cfa_offset 24
; HOIST0-NEXT: popq %r14
; HOIST0-NEXT: .cfi_def_cfa_offset 16
; HOIST0-NEXT: popq %r15
; HOIST0-NEXT: .cfi_def_cfa_offset 8
; HOIST0-NEXT: retq
;
; HOIST2-LABEL: _Z2f2i:
; HOIST2: # %bb.0: # %entry
; HOIST2-NEXT: pushq %rbp
; HOIST2-NEXT: .cfi_def_cfa_offset 16
; HOIST2-NEXT: pushq %r14
; HOIST2-NEXT: .cfi_def_cfa_offset 24
; HOIST2-NEXT: pushq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 32
; HOIST2-NEXT: .cfi_offset %rbx, -32
; HOIST2-NEXT: .cfi_offset %r14, -24
; HOIST2-NEXT: .cfi_offset %rbp, -16
; HOIST2-NEXT: testl %edi, %edi
; HOIST2-NEXT: je .LBB1_3
; HOIST2-NEXT: # %bb.1: # %while.body.preheader
; HOIST2-NEXT: movl %edi, %ebx
; HOIST2-NEXT: .p2align 4, 0x90
; HOIST2-NEXT: .LBB1_2: # %while.body
; HOIST2-NEXT: # =>This Inner Loop Header: Depth=1
; HOIST2-NEXT: callq _Z5gfuncv@PLT
; HOIST2-NEXT: movl %eax, %ebp
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: addl %ebp, (%rax)
; HOIST2-NEXT: callq _Z5gfuncv@PLT
; HOIST2-NEXT: movl %eax, %ebp
; HOIST2-NEXT: leaq _ZZ2f2iE2st.0@TLSLD(%rip), %rdi
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: movq %rax, %r14
; HOIST2-NEXT: addb %bpl, _ZZ2f2iE2st.0@DTPOFF(%rax)
; HOIST2-NEXT: callq _Z5gfuncv@PLT
; HOIST2-NEXT: movl %eax, %ecx
; HOIST2-NEXT: movq %r14, %rax
; HOIST2-NEXT: addl %ecx, _ZZ2f2iE2st.1@DTPOFF(%r14)
; HOIST2-NEXT: decl %ebx
; HOIST2-NEXT: jne .LBB1_2
; HOIST2-NEXT: .LBB1_3: # %while.end
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: movl (%rax), %eax
; HOIST2-NEXT: popq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 24
; HOIST2-NEXT: popq %r14
; HOIST2-NEXT: .cfi_def_cfa_offset 16
; HOIST2-NEXT: popq %rbp
; HOIST2-NEXT: .cfi_def_cfa_offset 8
; HOIST2-NEXT: retq
entry:
; A zero trip count skips the loop and goes straight to the final load.
%tobool.not9 = icmp eq i32 %c, 0
br i1 %tobool.not9, label %while.end, label %while.body

while.body: ; preds = %entry, %while.body
; %c.addr.010 is the remaining iteration count; it starts at %c and is
; decremented once per iteration.
%c.addr.010 = phi i32 [ %dec, %while.body ], [ %c, %entry ]
%dec = add nsw i32 %c.addr.010, -1
; First access: @thl_x += gfunc().
%call = tail call i32 @_Z5gfuncv()
%0 = load i32, i32* @thl_x, align 4
%add = add nsw i32 %0, %call
store i32 %add, i32* @thl_x, align 4
; Second access: the i8 global @_ZZ2f2iE2st.0 receives only the truncated
; low byte of the call result.
%call1 = tail call i32 @_Z5gfuncv()
%1 = load i8, i8* @_ZZ2f2iE2st.0, align 4
%2 = trunc i32 %call1 to i8
%conv5 = add i8 %1, %2
store i8 %conv5, i8* @_ZZ2f2iE2st.0, align 4
; Third access: @_ZZ2f2iE2st.1 += gfunc().
%call6 = tail call i32 @_Z5gfuncv()
%3 = load i32, i32* @_ZZ2f2iE2st.1, align 4
%add7 = add nsw i32 %3, %call6
store i32 %add7, i32* @_ZZ2f2iE2st.1, align 4
; Loop again until the counter reaches zero.
%tobool.not = icmp eq i32 %dec, 0
br i1 %tobool.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %entry
; @thl_x is read again after the loop, which gives one more use of its
; address outside the loop body.
%4 = load i32, i32* @thl_x, align 4
ret i32 %4
}

; External callee (gfunc() per Itanium mangling): takes no arguments and
; returns an i32. It is only declared here; @_Z2f2i calls it three times per
; loop iteration.
declare i32 @_Z5gfuncv() local_unnamed_addr #1

; f3(int) per Itanium mangling. Straight-line code with no loop: loads @thl_x
; and passes it to @_Z6gfunc2i, then loads @thl_x again and passes that to a
; second call, and finally returns the constant 1. The argument %c is unused.
;
; Both check prefixes expect identical assembly here: a single
; __tls_get_addr call for thl_x@TLSGD whose result is kept in %rbx so the
; second load can reuse it after the first call to gfunc2.
; Function Attrs: mustprogress uwtable
define i32 @_Z2f3i(i32 %c) local_unnamed_addr #0 {
; HOIST0-LABEL: _Z2f3i:
; HOIST0: # %bb.0: # %entry
; HOIST0-NEXT: pushq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 16
; HOIST0-NEXT: .cfi_offset %rbx, -16
; HOIST0-NEXT: data16
; HOIST0-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST0-NEXT: data16
; HOIST0-NEXT: data16
; HOIST0-NEXT: rex64
; HOIST0-NEXT: callq __tls_get_addr@PLT
; HOIST0-NEXT: movq %rax, %rbx
; HOIST0-NEXT: movl (%rax), %edi
; HOIST0-NEXT: callq _Z6gfunc2i@PLT
; HOIST0-NEXT: movl (%rbx), %edi
; HOIST0-NEXT: callq _Z6gfunc2i@PLT
; HOIST0-NEXT: movl $1, %eax
; HOIST0-NEXT: popq %rbx
; HOIST0-NEXT: .cfi_def_cfa_offset 8
; HOIST0-NEXT: retq
;
; HOIST2-LABEL: _Z2f3i:
; HOIST2: # %bb.0: # %entry
; HOIST2-NEXT: pushq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 16
; HOIST2-NEXT: .cfi_offset %rbx, -16
; HOIST2-NEXT: data16
; HOIST2-NEXT: leaq thl_x@TLSGD(%rip), %rdi
; HOIST2-NEXT: data16
; HOIST2-NEXT: data16
; HOIST2-NEXT: rex64
; HOIST2-NEXT: callq __tls_get_addr@PLT
; HOIST2-NEXT: movq %rax, %rbx
; HOIST2-NEXT: movl (%rax), %edi
; HOIST2-NEXT: callq _Z6gfunc2i@PLT
; HOIST2-NEXT: movl (%rbx), %edi
; HOIST2-NEXT: callq _Z6gfunc2i@PLT
; HOIST2-NEXT: movl $1, %eax
; HOIST2-NEXT: popq %rbx
; HOIST2-NEXT: .cfi_def_cfa_offset 8
; HOIST2-NEXT: retq
entry:
; First use of @thl_x: its value is the argument of the first call.
%0 = load i32, i32* @thl_x, align 4
%call = tail call i32 @_Z6gfunc2i(i32 %0)
; Second use of @thl_x in the same basic block, reloaded after the call.
%1 = load i32, i32* @thl_x, align 4
%call1 = tail call i32 @_Z6gfunc2i(i32 %1)
; Both call results are discarded; the function always returns 1.
ret i32 1
}

; Attribute groups shared by the functions in this test.
;   #0 - used by the definitions @_Z2f2i and @_Z2f3i (nounwind, mustprogress,
;        uwtable plus the common target settings).
;   #1 - used by the external declarations @_Z6gfunc2i and @_Z5gfuncv (target
;        settings only).
;   #2 - not referenced by the definitions directly above.
;        NOTE(review): presumably used by a function earlier in the file; confirm.
attributes #0 = { nounwind mustprogress uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #2 = { uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

; Module-level flags. The "PIC Level" flag (value 2) marks the module as
; position-independent code; the -tls-load-hoist option under test is
; described as applying to the PIC model, and the expected assembly above uses
; the @TLSGD/@TLSLD access sequences that call __tls_get_addr.
!llvm.module.flags = !{!0, !1, !2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"uwtable", i32 1}
1 change: 1 addition & 0 deletions llvm/tools/llc/llc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,7 @@ int main(int argc, char **argv) {
initializeHardwareLoopsPass(*Registry);
initializeTransformUtils(*Registry);
initializeReplaceWithVeclibLegacyPass(*Registry);
initializeTLSVariableHoistLegacyPassPass(*Registry);

// Initialize debugging passes.
initializeScavengerTestPass(*Registry);
Expand Down