278 changes: 19 additions & 259 deletions lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/RecordLayout.h"
#include "clang/Basic/SourceManager.h"

#include "Plugins/ExpressionParser/Clang/ClangUtil.h"
Expand Down Expand Up @@ -705,56 +704,6 @@ void ClangASTSource::FillNamespaceMap(
}
}

/// Thin wrapper around a Decl pointer that records, in the type, which
/// ASTContext the decl belongs to (parser vs. user/debug-info AST).
/// A null wrapped pointer means "no decl" (see IsValid/IsInvalid).
template <class D> class TaggedASTDecl {
public:
  TaggedASTDecl() : decl(nullptr) {}
  TaggedASTDecl(D *_decl) : decl(_decl) {}
  bool IsValid() const { return (decl != nullptr); }
  bool IsInvalid() const { return !IsValid(); }
  D *operator->() const { return decl; }
  D *decl;
};

/// dyn_cast for tagged decls: casts the wrapped pointer to D2 while
/// preserving the tag (TD is DeclFromParser or DeclFromUser).
/// Yields an invalid wrapper if the dyn_cast fails.
template <class D2, template <class D> class TD, class D1>
TD<D2> DynCast(TD<D1> source) {
  return TD<D2>(dyn_cast<D2>(source.decl));
}

template <class D = Decl> class DeclFromParser;
template <class D = Decl> class DeclFromUser;

/// A decl that lives in the parser's (expression) ASTContext.
template <class D> class DeclFromParser : public TaggedASTDecl<D> {
public:
  DeclFromParser() : TaggedASTDecl<D>() {}
  DeclFromParser(D *_decl) : TaggedASTDecl<D>(_decl) {}

  /// Look up the decl this parser decl was imported from.
  DeclFromUser<D> GetOrigin(ClangASTSource &source);
};

/// A decl that lives in the user's (debug-info) ASTContext.
template <class D> class DeclFromUser : public TaggedASTDecl<D> {
public:
  DeclFromUser() : TaggedASTDecl<D>() {}
  DeclFromUser(D *_decl) : TaggedASTDecl<D>(_decl) {}

  /// Copy this decl into the parser's ASTContext.
  DeclFromParser<D> Import(ClangASTSource &source);
};

/// Returns an invalid wrapper when no origin is recorded, or when the
/// origin decl is not of type D.
template <class D>
DeclFromUser<D> DeclFromParser<D>::GetOrigin(ClangASTSource &source) {
  ClangASTImporter::DeclOrigin origin = source.GetDeclOrigin(this->decl);
  if (!origin.Valid())
    return DeclFromUser<D>();
  return DeclFromUser<D>(dyn_cast<D>(origin.decl));
}

/// Returns an invalid wrapper when the copy fails, or when the copied
/// decl is not of type D.
template <class D>
DeclFromParser<D> DeclFromUser<D>::Import(ClangASTSource &source) {
  DeclFromParser<> parser_generic_decl(source.CopyDecl(this->decl));
  if (parser_generic_decl.IsInvalid())
    return DeclFromParser<D>();
  return DeclFromParser<D>(dyn_cast<D>(parser_generic_decl.decl));
}

bool ClangASTSource::FindObjCMethodDeclsWithOrigin(
NameSearchContext &context, ObjCInterfaceDecl *original_interface_decl,
const char *log_info) {
Expand Down Expand Up @@ -1188,8 +1137,8 @@ void ClangASTSource::FindObjCMethodDecls(NameSearchContext &context) {
} while (false);
}

static bool FindObjCPropertyAndIvarDeclsWithOrigin(
NameSearchContext &context, ClangASTSource &source,
bool ClangASTSource::FindObjCPropertyAndIvarDeclsWithOrigin(
NameSearchContext &context,
DeclFromUser<const ObjCInterfaceDecl> &origin_iface_decl) {
Log *log = GetLog(LLDBLog::Expressions);

Expand All @@ -1209,7 +1158,7 @@ static bool FindObjCPropertyAndIvarDeclsWithOrigin(

if (origin_property_decl.IsValid()) {
DeclFromParser<ObjCPropertyDecl> parser_property_decl(
origin_property_decl.Import(source));
origin_property_decl.Import(m_ast_context, *m_ast_importer_sp));
if (parser_property_decl.IsValid()) {
LLDB_LOG(log, " CAS::FOPD found\n{0}",
ClangUtil::DumpDecl(parser_property_decl.decl));
Expand All @@ -1224,7 +1173,7 @@ static bool FindObjCPropertyAndIvarDeclsWithOrigin(

if (origin_ivar_decl.IsValid()) {
DeclFromParser<ObjCIvarDecl> parser_ivar_decl(
origin_ivar_decl.Import(source));
origin_ivar_decl.Import(m_ast_context, *m_ast_importer_sp));
if (parser_ivar_decl.IsValid()) {
LLDB_LOG(log, " CAS::FOPD found\n{0}",
ClangUtil::DumpDecl(parser_ivar_decl.decl));
Expand All @@ -1243,7 +1192,7 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) {
DeclFromParser<const ObjCInterfaceDecl> parser_iface_decl(
cast<ObjCInterfaceDecl>(context.m_decl_context));
DeclFromUser<const ObjCInterfaceDecl> origin_iface_decl(
parser_iface_decl.GetOrigin(*this));
parser_iface_decl.GetOrigin(*m_ast_importer_sp));

ConstString class_name(parser_iface_decl->getNameAsString().c_str());

Expand All @@ -1253,7 +1202,7 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) {
m_ast_context, m_clang_ast_context->getDisplayName(),
parser_iface_decl->getName(), context.m_decl_name.getAsString());

if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, origin_iface_decl))
if (FindObjCPropertyAndIvarDeclsWithOrigin(context, origin_iface_decl))
return;

LLDB_LOG(log,
Expand Down Expand Up @@ -1286,7 +1235,7 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) {
"(ObjCInterfaceDecl*){0}/(ASTContext*){1}...",
complete_iface_decl.decl, &complete_iface_decl->getASTContext());

FindObjCPropertyAndIvarDeclsWithOrigin(context, *this, complete_iface_decl);
FindObjCPropertyAndIvarDeclsWithOrigin(context, complete_iface_decl);

return;
} while (false);
Expand Down Expand Up @@ -1320,7 +1269,7 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) {
interface_decl_from_modules.decl,
&interface_decl_from_modules->getASTContext());

if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this,
if (FindObjCPropertyAndIvarDeclsWithOrigin(context,
interface_decl_from_modules))
return;
} while (false);
Expand Down Expand Up @@ -1364,7 +1313,7 @@ void ClangASTSource::FindObjCPropertyAndIvarDecls(NameSearchContext &context) {
interface_decl_from_runtime.decl,
&interface_decl_from_runtime->getASTContext());

if (FindObjCPropertyAndIvarDeclsWithOrigin(context, *this,
if (FindObjCPropertyAndIvarDeclsWithOrigin(context,
interface_decl_from_runtime))
return;
} while (false);
Expand Down Expand Up @@ -1395,205 +1344,16 @@ void ClangASTSource::LookupInNamespace(NameSearchContext &context) {
}
}

typedef llvm::DenseMap<const FieldDecl *, uint64_t> FieldOffsetMap;
typedef llvm::DenseMap<const CXXRecordDecl *, CharUnits> BaseOffsetMap;

/// Import every (decl -> offset) entry of \p source_map into
/// \p destination_map, translating each key decl from the user AST into
/// the parser AST via \p source. Returns false if any decl fails to
/// import (destination_map may then be partially filled).
template <class D, class O>
static bool ImportOffsetMap(llvm::DenseMap<const D *, O> &destination_map,
                            llvm::DenseMap<const D *, O> &source_map,
                            ClangASTSource &source) {
  // When importing fields into a new record, clang has a hard requirement that
  // fields be imported in field offset order. Since they are stored in a
  // DenseMap with a pointer as the key type, this means we cannot simply
  // iterate over the map, as the order will be non-deterministic. Instead we
  // have to sort by the offset and then insert in sorted order.
  typedef llvm::DenseMap<const D *, O> MapType;
  typedef typename MapType::value_type PairType;
  std::vector<PairType> sorted_items;
  sorted_items.reserve(source_map.size());
  sorted_items.assign(source_map.begin(), source_map.end());
  // Sort by offset (the map's value), not by pointer key.
  llvm::sort(sorted_items, llvm::less_second());

  for (const auto &item : sorted_items) {
    DeclFromUser<D> user_decl(const_cast<D *>(item.first));
    DeclFromParser<D> parser_decl(user_decl.Import(source));
    if (parser_decl.IsInvalid())
      return false;
    destination_map.insert(
        std::pair<const D *, O>(parser_decl.decl, item.second));
  }

  return true;
}

/// Collect the (virtual) base-class offsets of \p record from
/// \p record_layout into \p base_offsets. \p IsVirtual selects virtual
/// bases; with IsVirtual == false, virtual bases encountered in the
/// non-virtual base list are skipped. Returns false when a base is not a
/// usable CXXRecordDecl.
template <bool IsVirtual>
bool ExtractBaseOffsets(const ASTRecordLayout &record_layout,
                        DeclFromUser<const CXXRecordDecl> &record,
                        BaseOffsetMap &base_offsets) {
  for (CXXRecordDecl::base_class_const_iterator
           bi = (IsVirtual ? record->vbases_begin() : record->bases_begin()),
           be = (IsVirtual ? record->vbases_end() : record->bases_end()),
       bi != be; ++bi) {
    // bases_begin() can include virtual bases; only vbases handling should
    // record them.
    if (!IsVirtual && bi->isVirtual())
      continue;

    const clang::Type *origin_base_type = bi->getType().getTypePtr();
    const clang::RecordType *origin_base_record_type =
        origin_base_type->getAs<RecordType>();

    if (!origin_base_record_type)
      return false;

    DeclFromUser<RecordDecl> origin_base_record(
        origin_base_record_type->getDecl());

    if (origin_base_record.IsInvalid())
      return false;

    DeclFromUser<CXXRecordDecl> origin_base_cxx_record(
        DynCast<CXXRecordDecl>(origin_base_record));

    if (origin_base_cxx_record.IsInvalid())
      return false;

    CharUnits base_offset;

    // Virtual and non-virtual bases are recorded in different tables of the
    // layout; query the matching one.
    if (IsVirtual)
      base_offset =
          record_layout.getVBaseClassOffset(origin_base_cxx_record.decl);
    else
      base_offset =
          record_layout.getBaseClassOffset(origin_base_cxx_record.decl);

    base_offsets.insert(std::pair<const CXXRecordDecl *, CharUnits>(
        origin_base_cxx_record.decl, base_offset));
  }

  return true;
}

bool ClangASTSource::layoutRecordType(const RecordDecl *record, uint64_t &size,
uint64_t &alignment,
FieldOffsetMap &field_offsets,
BaseOffsetMap &base_offsets,
BaseOffsetMap &virtual_base_offsets) {

Log *log = GetLog(LLDBLog::Expressions);

LLDB_LOG(log,
"LayoutRecordType on (ASTContext*){0} '{1}' for (RecordDecl*)"
"{2} [name = '{3}']",
m_ast_context, m_clang_ast_context->getDisplayName(), record,
record->getName());

DeclFromParser<const RecordDecl> parser_record(record);
DeclFromUser<const RecordDecl> origin_record(
parser_record.GetOrigin(*this));

if (origin_record.IsInvalid())
return false;

FieldOffsetMap origin_field_offsets;
BaseOffsetMap origin_base_offsets;
BaseOffsetMap origin_virtual_base_offsets;

TypeSystemClang::GetCompleteDecl(
&origin_record->getASTContext(),
const_cast<RecordDecl *>(origin_record.decl));

clang::RecordDecl *definition = origin_record.decl->getDefinition();
if (!definition || !definition->isCompleteDefinition())
return false;

const ASTRecordLayout &record_layout(
origin_record->getASTContext().getASTRecordLayout(origin_record.decl));

int field_idx = 0, field_count = record_layout.getFieldCount();

for (RecordDecl::field_iterator fi = origin_record->field_begin(),
fe = origin_record->field_end();
fi != fe; ++fi) {
if (field_idx >= field_count)
return false; // Layout didn't go well. Bail out.

uint64_t field_offset = record_layout.getFieldOffset(field_idx);

origin_field_offsets.insert(
std::pair<const FieldDecl *, uint64_t>(*fi, field_offset));

field_idx++;
}

lldbassert(&record->getASTContext() == m_ast_context);

DeclFromUser<const CXXRecordDecl> origin_cxx_record(
DynCast<const CXXRecordDecl>(origin_record));

if (origin_cxx_record.IsValid()) {
if (!ExtractBaseOffsets<false>(record_layout, origin_cxx_record,
origin_base_offsets) ||
!ExtractBaseOffsets<true>(record_layout, origin_cxx_record,
origin_virtual_base_offsets))
return false;
}

if (!ImportOffsetMap(field_offsets, origin_field_offsets, *this) ||
!ImportOffsetMap(base_offsets, origin_base_offsets, *this) ||
!ImportOffsetMap(virtual_base_offsets, origin_virtual_base_offsets,
*this))
return false;

size = record_layout.getSize().getQuantity() * m_ast_context->getCharWidth();
alignment = record_layout.getAlignment().getQuantity() *
m_ast_context->getCharWidth();

if (log) {
LLDB_LOG(log, "LRT returned:");
LLDB_LOG(log, "LRT Original = (RecordDecl*){0}",
static_cast<const void *>(origin_record.decl));
LLDB_LOG(log, "LRT Size = {0}", size);
LLDB_LOG(log, "LRT Alignment = {0}", alignment);
LLDB_LOG(log, "LRT Fields:");
for (RecordDecl::field_iterator fi = record->field_begin(),
fe = record->field_end();
fi != fe; ++fi) {
LLDB_LOG(log,
"LRT (FieldDecl*){0}, Name = '{1}', Type = '{2}', Offset = "
"{3} bits",
*fi, fi->getName(), fi->getType().getAsString(),
field_offsets[*fi]);
}
DeclFromParser<const CXXRecordDecl> parser_cxx_record =
DynCast<const CXXRecordDecl>(parser_record);
if (parser_cxx_record.IsValid()) {
LLDB_LOG(log, "LRT Bases:");
for (CXXRecordDecl::base_class_const_iterator
bi = parser_cxx_record->bases_begin(),
be = parser_cxx_record->bases_end();
bi != be; ++bi) {
bool is_virtual = bi->isVirtual();

QualType base_type = bi->getType();
const RecordType *base_record_type = base_type->getAs<RecordType>();
DeclFromParser<RecordDecl> base_record(base_record_type->getDecl());
DeclFromParser<CXXRecordDecl> base_cxx_record =
DynCast<CXXRecordDecl>(base_record);

LLDB_LOG(log,
"LRT {0}(CXXRecordDecl*){1}, Name = '{2}', Offset = "
"{3} chars",
(is_virtual ? "Virtual " : ""), base_cxx_record.decl,
base_cxx_record.decl->getName(),
(is_virtual
? virtual_base_offsets[base_cxx_record.decl].getQuantity()
: base_offsets[base_cxx_record.decl].getQuantity()));
}
} else {
LLDB_LOG(log, "LRD Not a CXXRecord, so no bases");
}
}

return true;
// ExternalASTSource callback asking for the layout (size, alignment, and
// field/base offsets) of \p record. Delegates entirely to the AST
// importer, which reconstructs the layout from the record's origin decl.
bool ClangASTSource::layoutRecordType(
    const RecordDecl *record, uint64_t &size, uint64_t &alignment,
    llvm::DenseMap<const clang::FieldDecl *, uint64_t> &field_offsets,
    llvm::DenseMap<const clang::CXXRecordDecl *, clang::CharUnits>
        &base_offsets,
    llvm::DenseMap<const clang::CXXRecordDecl *, clang::CharUnits>
        &virtual_base_offsets) {
  return m_ast_importer_sp->importRecordLayoutFromOrigin(
      record, size, alignment, field_offsets, base_offsets,
      virtual_base_offsets);
}

void ClangASTSource::CompleteNamespaceMap(
Expand Down
5 changes: 5 additions & 0 deletions lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,11 @@ class ClangASTSource : public clang::ExternalASTSource,
/// ExternalASTSource.
TypeSystemClang *GetTypeSystem() const { return m_clang_ast_context; }

private:
bool FindObjCPropertyAndIvarDeclsWithOrigin(
NameSearchContext &context,
DeclFromUser<const clang::ObjCInterfaceDecl> &origin_iface_decl);

protected:
bool FindObjCMethodDeclsWithOrigin(
NameSearchContext &context,
Expand Down
4 changes: 4 additions & 0 deletions lldb/test/API/lang/cpp/gmodules/alignment/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PCH_CXX_SOURCE = pch.h
CXX_SOURCES = main.cpp

include Makefile.rules
60 changes: 60 additions & 0 deletions lldb/test/API/lang/cpp/gmodules/alignment/TestPchAlignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Tests that we correctly track AST layout info
(specifically alignment) when moving AST nodes
between ClangASTImporter instances (in this case,
from pch to executable to expression AST).
"""

import lldb
import os
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil


class TestPchAlignment(TestBase):
    """Check that over-aligned struct layout survives the PCH round trip.

    Both tests stop at the "return data" line of main.cpp and inspect the
    local `data` (of type MatrixData, declared in the PCH):

    - test_frame_var reads it via ``frame variable`` (no expression AST).
    - test_expr evaluates it as an expression, which imports the type
      through an additional ClangASTImporter hop.

    NOTE: the original version had the two method names swapped relative
    to their bodies (test_expr ran `frame variable` and vice versa); they
    are now named after what they actually exercise.
    """

    @add_test_categories(["gmodules"])
    def test_frame_var(self):
        """`frame variable data` must display every member correctly."""
        self.build()
        lldbutil.run_to_source_breakpoint(
            self, "return data", lldb.SBFileSpec("main.cpp")
        )

        self.expect(
            "frame variable data",
            substrs=["row = 1", "col = 2", "row = 3", "col = 4", "stride = 5"],
        )

    @add_test_categories(["gmodules"])
    def test_expr(self):
        """Expression evaluation must reconstruct the full MatrixData value."""
        self.build()
        lldbutil.run_to_source_breakpoint(
            self, "return data", lldb.SBFileSpec("main.cpp")
        )

        self.expect_expr(
            "data",
            result_type="MatrixData",
            result_children=[
                ValueCheck(
                    name="section",
                    children=[
                        ValueCheck(
                            name="origin",
                            children=[
                                ValueCheck(name="row", value="1"),
                                ValueCheck(name="col", value="2"),
                            ],
                        ),
                        ValueCheck(
                            name="size",
                            children=[
                                ValueCheck(name="row", value="3"),
                                ValueCheck(name="col", value="4"),
                            ],
                        ),
                    ],
                ),
                ValueCheck(name="stride", value="5"),
            ],
        )
10 changes: 10 additions & 0 deletions lldb/test/API/lang/cpp/gmodules/alignment/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Test fixture: MatrixData is not declared here; it is presumably provided
// by the precompiled header (pch.h) force-included by the build -- confirm
// against the Makefile's PCH_CXX_SOURCE setting.
int main(int argc, const char *argv[]) {
  // Zero-initialize, then give every member a distinct value so a debugger
  // can verify each field landed at the right offset.
  struct MatrixData data = {0};
  data.section.origin.row = 1;
  data.section.origin.col = 2;
  data.section.size.row = 3;
  data.section.size.col = 4;
  data.stride = 5;

  // NOTE(review): "return data" looks like a source-breakpoint anchor for
  // the accompanying debugger test -- keep this line's text stable.
  return data.section.size.row;
}
21 changes: 21 additions & 0 deletions lldb/test/API/lang/cpp/gmodules/alignment/pch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#ifndef PCH_H_IN
#define PCH_H_IN

// Deliberately over-aligned types: the 64-byte alignment must survive
// being imported out of the precompiled header into other ASTs.
static const int kAlignment = 64;

struct [[gnu::aligned(kAlignment)]] RowCol {
  unsigned row;
  unsigned col;
};

struct [[gnu::aligned(kAlignment)]] Submatrix {
  struct RowCol origin;
  struct RowCol size;
};

struct [[gnu::aligned(kAlignment)]] MatrixData {
  struct Submatrix section;
  unsigned stride;
};

#endif // PCH_H_IN
12 changes: 6 additions & 6 deletions lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_termination(self):
self.dap_server.request_disconnect()

# Wait until the underlying lldb-dap process dies.
self.dap_server.process.wait(timeout=10)
self.dap_server.process.wait(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)

# Check the return code
self.assertEqual(self.dap_server.process.poll(), 0)
Expand Down Expand Up @@ -334,14 +334,14 @@ def test_commands(self):
# Get output from the console. This should contain both the
# "stopCommands" that were run after the first breakpoint was hit
self.continue_to_breakpoints(breakpoint_ids)
output = self.get_console(timeout=1.0)
output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)
self.verify_commands("stopCommands", output, stopCommands)

# Continue again and hit the second breakpoint.
# Get output from the console. This should contain both the
# "stopCommands" that were run after the second breakpoint was hit
self.continue_to_breakpoints(breakpoint_ids)
output = self.get_console(timeout=1.0)
output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)
self.verify_commands("stopCommands", output, stopCommands)

# Continue until the program exits
Expand Down Expand Up @@ -402,21 +402,21 @@ def test_extra_launch_commands(self):
self.verify_commands("launchCommands", output, launchCommands)
# Verify the "stopCommands" here
self.continue_to_next_stop()
output = self.get_console(timeout=1.0)
output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)
self.verify_commands("stopCommands", output, stopCommands)

# Continue and hit the second breakpoint.
# Get output from the console. This should contain both the
# "stopCommands" that were run after the first breakpoint was hit
self.continue_to_next_stop()
output = self.get_console(timeout=1.0)
output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)
self.verify_commands("stopCommands", output, stopCommands)

# Continue until the program exits
self.continue_to_exit()
# Get output from the console. This should contain both the
# "exitCommands" that were run after the second breakpoint was hit
output = self.get_console(timeout=1.0)
output = self.get_console(timeout=lldbdap_testcase.DAPTestCaseBase.timeoutval)
self.verify_commands("exitCommands", output, exitCommands)

@skipIfWindows
Expand Down
11 changes: 11 additions & 0 deletions llvm/docs/CodeOfConduct.rst
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,17 @@ events as part of each events' information. In person reports will still be
kept confidential exactly as above, but also feel free to (anonymously if
needed) email conduct@llvm.org.

Bans
====

The code of conduct committee may decide to ban an individual from the
community for violating the code of conduct. The goal of a ban is to protect
community members from having to interact with people who are consistently not
respecting the code of conduct. Please refer to the
:doc:`Developer Policy<DeveloperPolicy>` section on Bans for how to handle
interactions with former community members. If you need further guidance,
please contact conduct@llvm.org.

Code of Conduct Committee
=========================

Expand Down
21 changes: 17 additions & 4 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,15 @@ static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
cl::Hidden,
cl::desc("Enable the LoopFlatten Pass"));

// Experimentally allow loop header duplication. This should allow for better
// optimization at Oz, since loop-idiom recognition can then recognize things
// like memcpy. If this ends up being useful for many targets, we should drop
// this flag and make a code generation option that can be controlled
// independent of the opt level and exposed through the frontend.
static cl::opt<bool> EnableLoopHeaderDuplication(
"enable-loop-header-duplication", cl::init(false), cl::Hidden,
cl::desc("Enable loop header duplication at any optimization level"));

static cl::opt<bool>
EnableDFAJumpThreading("enable-dfa-jump-thread",
cl::desc("Enable DFA jump threading"),
Expand Down Expand Up @@ -630,8 +639,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
/*AllowSpeculation=*/false));

// Disable header duplication in loop rotation at -Oz.
LPM1.addPass(
LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase)));
LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
Level != OptimizationLevel::Oz,
isLTOPreLink(Phase)));
// TODO: Investigate promotion cap for O1.
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true));
Expand Down Expand Up @@ -812,7 +822,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
// Disable header duplication in loop rotation at -Oz.
MPM.addPass(createModuleToFunctionPassAdaptor(
createFunctionToLoopPassAdaptor(
LoopRotatePass(Level != OptimizationLevel::Oz),
LoopRotatePass(EnableLoopHeaderDuplication ||
Level != OptimizationLevel::Oz),
/*UseMemorySSA=*/false,
/*UseBlockFrequencyInfo=*/false),
PTO.EagerlyInvalidateAnalyses));
Expand Down Expand Up @@ -1422,7 +1433,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
LoopPassManager LPM;
// First rotate loops that may have been un-rotated by prior passes.
// Disable header duplication at -Oz.
LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink));
LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
Level != OptimizationLevel::Oz,
LTOPreLink));
// Some loops may have become dead by now. Try to delete them.
// FIXME: see discussion in https://reviews.llvm.org/D112851,
// this may need to be revisited once we run GVN before loop deletion
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3089,10 +3089,10 @@ SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
assert(ResultVT == Arg.getValueType());

auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal);
NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/Hexagon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ add_llvm_target(HexagonCodeGen
HexagonISelDAGToDAGHVX.cpp
HexagonISelLowering.cpp
HexagonISelLoweringHVX.cpp
HexagonLoopAlign.cpp
HexagonLoopIdiomRecognition.cpp
HexagonMachineFunctionInfo.cpp
HexagonMachineScheduler.cpp
Expand Down
216 changes: 216 additions & 0 deletions llvm/lib/Target/Hexagon/HexagonLoopAlign.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
//===----- HexagonLoopAlign.cpp - Generate loop alignment directives -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Inspect each basic block and, if it is a single-basic-block loop with a
// small number of instructions, set its preferred loop alignment to 32
// bytes (2^5).
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "hexagon-loop-align"

#include "HexagonTargetMachine.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

// Command-line knobs for the loop-alignment heuristic. The *LimitUB/LB
// options bound the instruction count of a candidate loop; the bundle
// limits bound its bundle count; the edge threshold gates scalar loops on
// back-edge frequency.
static cl::opt<bool>
    DisableLoopAlign("disable-hexagon-loop-align", cl::Hidden,
                     cl::desc("Disable Hexagon loop alignment pass"));

static cl::opt<uint32_t> HVXLoopAlignLimitUB(
    "hexagon-hvx-loop-align-limit-ub", cl::Hidden, cl::init(16),
    cl::desc("Set hexagon hvx loop upper bound align limit"));

static cl::opt<uint32_t> TinyLoopAlignLimitUB(
    "hexagon-tiny-loop-align-limit-ub", cl::Hidden, cl::init(16),
    cl::desc("Set hexagon tiny-core loop upper bound align limit"));

static cl::opt<uint32_t>
    LoopAlignLimitUB("hexagon-loop-align-limit-ub", cl::Hidden, cl::init(8),
                     cl::desc("Set hexagon loop upper bound align limit"));

static cl::opt<uint32_t>
    LoopAlignLimitLB("hexagon-loop-align-limit-lb", cl::Hidden, cl::init(4),
                     cl::desc("Set hexagon loop lower bound align limit"));

static cl::opt<uint32_t>
    LoopBndlAlignLimit("hexagon-loop-bundle-align-limit", cl::Hidden,
                       cl::init(4),
                       cl::desc("Set hexagon loop align bundle limit"));

static cl::opt<uint32_t> TinyLoopBndlAlignLimit(
    "hexagon-tiny-loop-bundle-align-limit", cl::Hidden, cl::init(8),
    cl::desc("Set hexagon tiny-core loop align bundle limit"));

// Fixed typo in the help text: "theshold" -> "threshold".
static cl::opt<uint32_t>
    LoopEdgeThreshold("hexagon-loop-edge-threshold", cl::Hidden, cl::init(7500),
                      cl::desc("Set hexagon loop align edge threshold"));

namespace llvm {
FunctionPass *createHexagonLoopAlign();
void initializeHexagonLoopAlignPass(PassRegistry &);
} // namespace llvm

namespace {

/// Machine pass that raises the alignment of small single-basic-block
/// loops to 32 bytes so the loop body starts on a fetch-friendly boundary.
class HexagonLoopAlign : public MachineFunctionPass {
  // Cached per-function in runOnMachineFunction.
  const HexagonSubtarget *HST = nullptr;
  const TargetMachine *HTM = nullptr;
  const HexagonInstrInfo *HII = nullptr;

public:
  static char ID;
  HexagonLoopAlign() : MachineFunctionPass(ID) {
    initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
  }
  /// Heuristic: is this loop small enough (and hot enough, per
  /// \p AboveThres) to be worth aligning?
  bool shouldBalignLoop(MachineBasicBlock &BB, bool AboveThres);
  /// True if \p MBB is a self-loop with exactly two successors.
  bool isSingleLoop(MachineBasicBlock &MBB);
  /// Align \p MBB to 32 bytes if it qualifies; returns true on change.
  bool attemptToBalignSmallLoop(MachineFunction &MF, MachineBasicBlock &MBB);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Needed to compute the back-edge frequency of candidate loops.
    AU.addRequired<MachineBranchProbabilityInfo>();
    AU.addRequired<MachineBlockFrequencyInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override { return "Hexagon LoopAlign pass"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
};

char HexagonLoopAlign::ID = 0;

// Decide whether the loop body in \p BB is worth aligning to 32 bytes.
// Counts instructions and bundles up to the endloop marker, then checks
// them against the limit that applies to this core/loop kind.
bool HexagonLoopAlign::shouldBalignLoop(MachineBasicBlock &BB,
                                        bool AboveThres) {
  bool HasHVX = false;
  unsigned NumInsts = 0;
  unsigned NumBundles = 0;

  for (MachineInstr &MI : BB.instrs()) {
    // Stop at the endloop pseudo; nothing after it belongs to the body.
    if (HII->isEndLoopN(MI.getOpcode()))
      break;
    // Bundles are counted separately from individual instructions.
    if (MI.isBundle()) {
      ++NumBundles;
      continue;
    }
    // Debug instructions don't occupy fetch bytes.
    if (MI.isDebugInstr())
      continue;
    // Remember whether the loop contains any HVX instruction.
    HasHVX |= HII->isHVXVec(MI);
    ++NumInsts;
  }

  LLVM_DEBUG({
    dbgs() << "Bundle Count : " << NumBundles << "\n";
    dbgs() << "Instruction Count : " << NumInsts << "\n";
  });

  // Pick the applicable upper bounds, in priority order:
  // tiny core > HVX loop > hot scalar loop.
  unsigned MaxInsts = 0;
  unsigned MaxBundles = LoopBndlAlignLimit;
  if (HST->isTinyCore()) {
    MaxInsts = TinyLoopAlignLimitUB;
    MaxBundles = TinyLoopBndlAlignLimit;
  } else if (HasHVX) {
    MaxInsts = HVXLoopAlignLimitUB;
  } else if (AboveThres) {
    MaxInsts = LoopAlignLimitUB;
  }

  // An unset upper bound means none of the criteria applied.
  if (MaxInsts == 0)
    return false;

  return NumInsts >= LoopAlignLimitLB && NumInsts <= MaxInsts &&
         NumBundles <= MaxBundles;
}

// A single-block loop: the block branches back to itself and has exactly
// two successors (itself plus the loop exit).
bool HexagonLoopAlign::isSingleLoop(MachineBasicBlock &MBB) {
  return MBB.succ_size() == 2 && MBB.isSuccessor(&MBB);
}

// Align \p MBB to 32 bytes if it is a qualifying single-block loop.
// Returns true when the alignment was changed.
// NOTE(review): the MF parameter is not referenced in this body.
bool HexagonLoopAlign::attemptToBalignSmallLoop(MachineFunction &MF,
                                                MachineBasicBlock &MBB) {
  if (!isSingleLoop(MBB))
    return false;

  const MachineBranchProbabilityInfo *MBPI =
      &getAnalysis<MachineBranchProbabilityInfo>();
  const MachineBlockFrequencyInfo *MBFI =
      &getAnalysis<MachineBlockFrequencyInfo>();

  // Compute the frequency of the back edge (block frequency scaled by the
  // probability of branching back to the same block).
  BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
  BranchProbability BrProb = MBPI->getEdgeProbability(&MBB, &MBB);
  BlockFrequency EdgeFreq = BlockFreq * BrProb;
  LLVM_DEBUG({
    dbgs() << "Loop Align Pass:\n";
    dbgs() << "\tedge with freq(" << EdgeFreq.getFrequency() << ")\n";
  });

  // Hot loops (back edge above the threshold) qualify even without HVX.
  bool AboveThres = EdgeFreq.getFrequency() > LoopEdgeThreshold;
  if (shouldBalignLoop(MBB, AboveThres)) {
    // We found a loop, change its alignment to be 32 (5).
    MBB.setAlignment(llvm::Align(1 << 5));
    return true;
  }
  return false;
}

// Inspect each basic block, and if its a single BB loop, see if it
// meets the criteria for increasing alignment to 32.

// Walk every block of \p MF and raise the alignment of each qualifying
// single-block loop to 32 bytes. Returns true if anything changed.
bool HexagonLoopAlign::runOnMachineFunction(MachineFunction &MF) {
  HST = &MF.getSubtarget<HexagonSubtarget>();
  HII = HST->getInstrInfo();
  HTM = &MF.getTarget();

  if (skipFunction(MF.getFunction()) || DisableLoopAlign)
    return false;

  // This optimization is performed at
  // i) -O2 and above, and when the loop has a HVX instruction.
  // ii) -O3
  const CodeGenOptLevel MinLevel = HST->useHVXOps()
                                       ? CodeGenOptLevel::Default
                                       : CodeGenOptLevel::Aggressive;
  if (HTM->getOptLevel() < MinLevel)
    return false;

  bool Modified = false;
  for (MachineBasicBlock &MBB : MF)
    Modified |= attemptToBalignSmallLoop(MF, MBB);
  return Modified;
}

} // namespace

INITIALIZE_PASS(HexagonLoopAlign, "hexagon-loop-align",
"Hexagon LoopAlign pass", false, false)

//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//

FunctionPass *llvm::createHexagonLoopAlign() { return new HexagonLoopAlign(); }
9 changes: 8 additions & 1 deletion llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ namespace llvm {
void initializeHexagonGenMuxPass(PassRegistry&);
void initializeHexagonHardwareLoopsPass(PassRegistry&);
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &);
void initializeHexagonLoopAlignPass(PassRegistry &);
void initializeHexagonNewValueJumpPass(PassRegistry&);
void initializeHexagonOptAddrModePass(PassRegistry&);
void initializeHexagonPacketizerPass(PassRegistry&);
Expand Down Expand Up @@ -194,6 +195,7 @@ namespace llvm {
FunctionPass *createHexagonHardwareLoops();
FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
CodeGenOptLevel OptLevel);
FunctionPass *createHexagonLoopAlign();
FunctionPass *createHexagonLoopRescheduling();
FunctionPass *createHexagonNewValueJump();
FunctionPass *createHexagonOptAddrMode();
Expand Down Expand Up @@ -256,8 +258,10 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small),
(HexagonNoOpt ? CodeGenOptLevel::None : OL)),
TLOF(std::make_unique<HexagonTargetObjectFile>()) {
TLOF(std::make_unique<HexagonTargetObjectFile>()),
Subtarget(Triple(TT), CPU, FS, *this) {
initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry());
initializeHexagonLoopAlignPass(*PassRegistry::getPassRegistry());
initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry());
initAsmInfo();
}
Expand Down Expand Up @@ -476,6 +480,9 @@ void HexagonPassConfig::addPreEmitPass() {
// Packetization is mandatory: it handles gather/scatter at all opt levels.
addPass(createHexagonPacketizer(NoOpt));

if (!NoOpt)
addPass(createHexagonLoopAlign());

if (EnableVectorPrint)
addPass(createHexagonVectorPrint());

Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/Hexagon/HexagonTargetMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ namespace llvm {

class HexagonTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
HexagonSubtarget Subtarget;
mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap;

public:
Expand Down
86 changes: 39 additions & 47 deletions llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -983,27 +983,22 @@ using DirectEdges = SmallVector<PGOUseEdge *, 2>;

// This class stores the auxiliary information for each BB.
struct PGOUseBBInfo : public PGOBBInfo {
uint64_t CountValue = 0;
bool CountValid;
std::optional<uint64_t> Count;
int32_t UnknownCountInEdge = 0;
int32_t UnknownCountOutEdge = 0;
DirectEdges InEdges;
DirectEdges OutEdges;

PGOUseBBInfo(unsigned IX) : PGOBBInfo(IX), CountValid(false) {}
PGOUseBBInfo(unsigned IX) : PGOBBInfo(IX) {}

// Set the profile count value for this BB.
void setBBInfoCount(uint64_t Value) {
CountValue = Value;
CountValid = true;
}
void setBBInfoCount(uint64_t Value) { Count = Value; }

// Return the information string of this object.
std::string infoString() const {
if (!CountValid)
if (!Count)
return PGOBBInfo::infoString();
return (Twine(PGOBBInfo::infoString()) + " Count=" + Twine(CountValue))
.str();
return (Twine(PGOBBInfo::infoString()) + " Count=" + Twine(*Count)).str();
}

// Add an OutEdge and update the edge count.
Expand Down Expand Up @@ -1216,15 +1211,15 @@ bool PGOUseFunc::setInstrumentedCounts(

// If only one out-edge, the edge profile count should be the same as BB
// profile count.
if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
setEdgeCount(E.get(), SrcInfo.CountValue);
if (SrcInfo.Count && SrcInfo.OutEdges.size() == 1)
setEdgeCount(E.get(), *SrcInfo.Count);
else {
const BasicBlock *DestBB = E->DestBB;
PGOUseBBInfo &DestInfo = getBBInfo(DestBB);
// If only one in-edge, the edge profile count should be the same as BB
// profile count.
if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
setEdgeCount(E.get(), DestInfo.CountValue);
if (DestInfo.Count && DestInfo.InEdges.size() == 1)
setEdgeCount(E.get(), *DestInfo.Count);
}
if (E->CountValid)
continue;
Expand Down Expand Up @@ -1481,38 +1476,36 @@ void PGOUseFunc::populateCounters() {
// For efficient traversal, it's better to start from the end as most
// of the instrumented edges are at the end.
for (auto &BB : reverse(F)) {
PGOUseBBInfo *Count = findBBInfo(&BB);
if (Count == nullptr)
PGOUseBBInfo *UseBBInfo = findBBInfo(&BB);
if (UseBBInfo == nullptr)
continue;
if (!Count->CountValid) {
if (Count->UnknownCountOutEdge == 0) {
Count->CountValue = sumEdgeCount(Count->OutEdges);
Count->CountValid = true;
if (!UseBBInfo->Count) {
if (UseBBInfo->UnknownCountOutEdge == 0) {
UseBBInfo->Count = sumEdgeCount(UseBBInfo->OutEdges);
Changes = true;
} else if (Count->UnknownCountInEdge == 0) {
Count->CountValue = sumEdgeCount(Count->InEdges);
Count->CountValid = true;
} else if (UseBBInfo->UnknownCountInEdge == 0) {
UseBBInfo->Count = sumEdgeCount(UseBBInfo->InEdges);
Changes = true;
}
}
if (Count->CountValid) {
if (Count->UnknownCountOutEdge == 1) {
if (UseBBInfo->Count) {
if (UseBBInfo->UnknownCountOutEdge == 1) {
uint64_t Total = 0;
uint64_t OutSum = sumEdgeCount(Count->OutEdges);
uint64_t OutSum = sumEdgeCount(UseBBInfo->OutEdges);
// If the one of the successor block can early terminate (no-return),
// we can end up with situation where out edge sum count is larger as
// the source BB's count is collected by a post-dominated block.
if (Count->CountValue > OutSum)
Total = Count->CountValue - OutSum;
setEdgeCount(Count->OutEdges, Total);
if (*UseBBInfo->Count > OutSum)
Total = *UseBBInfo->Count - OutSum;
setEdgeCount(UseBBInfo->OutEdges, Total);
Changes = true;
}
if (Count->UnknownCountInEdge == 1) {
if (UseBBInfo->UnknownCountInEdge == 1) {
uint64_t Total = 0;
uint64_t InSum = sumEdgeCount(Count->InEdges);
if (Count->CountValue > InSum)
Total = Count->CountValue - InSum;
setEdgeCount(Count->InEdges, Total);
uint64_t InSum = sumEdgeCount(UseBBInfo->InEdges);
if (*UseBBInfo->Count > InSum)
Total = *UseBBInfo->Count - InSum;
setEdgeCount(UseBBInfo->InEdges, Total);
Changes = true;
}
}
Expand All @@ -1527,16 +1520,16 @@ void PGOUseFunc::populateCounters() {
auto BI = findBBInfo(&BB);
if (BI == nullptr)
continue;
assert(BI->CountValid && "BB count is not valid");
assert(BI->Count && "BB count is not valid");
}
#endif
uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
uint64_t FuncEntryCount = *getBBInfo(&*F.begin()).Count;
uint64_t FuncMaxCount = FuncEntryCount;
for (auto &BB : F) {
auto BI = findBBInfo(&BB);
if (BI == nullptr)
continue;
FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
FuncMaxCount = std::max(FuncMaxCount, *BI->Count);
}

// Fix the obviously inconsistent entry count.
Expand Down Expand Up @@ -1566,11 +1559,11 @@ void PGOUseFunc::setBranchWeights() {
isa<CallBrInst>(TI)))
continue;

if (getBBInfo(&BB).CountValue == 0)
const PGOUseBBInfo &BBCountInfo = getBBInfo(&BB);
if (!*BBCountInfo.Count)
continue;

// We have a non-zero Branch BB.
const PGOUseBBInfo &BBCountInfo = getBBInfo(&BB);
unsigned Size = BBCountInfo.OutEdges.size();
SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
uint64_t MaxCount = 0;
Expand Down Expand Up @@ -1622,7 +1615,7 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() {
if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
Instruction *TI = BB.getTerminator();
const PGOUseBBInfo &BBCountInfo = getBBInfo(&BB);
setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
setIrrLoopHeaderMetadata(M, TI, *BBCountInfo.Count);
}
}
}
Expand All @@ -1649,7 +1642,7 @@ void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
uint64_t TotalCount = 0;
auto BI = UseFunc->findBBInfo(SI.getParent());
if (BI != nullptr)
TotalCount = BI->CountValue;
TotalCount = *BI->Count;
// False Count
SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
Expand Down Expand Up @@ -1850,7 +1843,7 @@ static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI,
if (!Func.findBBInfo(&BBI))
continue;
auto BFICount = NBFI.getBlockProfileCount(&BBI);
CountValue = Func.getBBInfo(&BBI).CountValue;
CountValue = *Func.getBBInfo(&BBI).Count;
BFICountValue = *BFICount;
SumCount.add(APFloat(CountValue * 1.0), APFloat::rmNearestTiesToEven);
SumBFICount.add(APFloat(BFICountValue * 1.0), APFloat::rmNearestTiesToEven);
Expand All @@ -1866,7 +1859,7 @@ static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI,
if (Scale < 1.001 && Scale > 0.999)
return;

uint64_t FuncEntryCount = Func.getBBInfo(&*F.begin()).CountValue;
uint64_t FuncEntryCount = *Func.getBBInfo(&*F.begin()).Count;
uint64_t NewEntryCount = 0.5 + FuncEntryCount * Scale;
if (NewEntryCount == 0)
NewEntryCount = 1;
Expand Down Expand Up @@ -1896,8 +1889,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI,
uint64_t CountValue = 0;
uint64_t BFICountValue = 0;

if (Func.getBBInfo(&BBI).CountValid)
CountValue = Func.getBBInfo(&BBI).CountValue;
CountValue = Func.getBBInfo(&BBI).Count.value_or(CountValue);

BBNum++;
if (CountValue)
Expand Down Expand Up @@ -2279,8 +2271,8 @@ template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
OS << getSimpleNodeName(Node) << ":\\l";
PGOUseBBInfo *BI = Graph->findBBInfo(Node);
OS << "Count : ";
if (BI && BI->CountValid)
OS << BI->CountValue << "\\l";
if (BI && BI->Count)
OS << *BI->Count << "\\l";
else
OS << "Unknown\\l";

Expand Down
9 changes: 4 additions & 5 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4100,16 +4100,15 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// increases the cost.
Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
bool ProfitableGatherPointers =
static_cast<unsigned>(count_if(
PointerOps,
[L](Value *V) { return L && L->isLoopInvariant(V); })) >= Sz / 2 &&
L &&
count_if(PointerOps, [L](Value *V) { return L->isLoopInvariant(V); }) <=
Sz / 2 &&
Sz > 2;
if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
auto *GEP = dyn_cast<GetElementPtrInst>(P);
return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
(GEP && GEP->getNumOperands() == 2 &&
(isa<Constant>(GEP->getOperand(1)) ||
isa<Instruction>(GEP->getOperand(1))));
isa<Constant, Instruction>(GEP->getOperand(1)));
})) {
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
Expand Down
27 changes: 13 additions & 14 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -492,9 +492,9 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
Expand All @@ -512,17 +512,17 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_subrev_u32_e32 v0, vcc, 24, v0
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
Expand All @@ -531,15 +531,14 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T1.W, PV.W,
; EG-NEXT: AND_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T2.W, literal.y,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
Expand All @@ -556,9 +555,9 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX10-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
Expand All @@ -582,10 +581,10 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u8 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v1, 32, v1
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
77 changes: 39 additions & 38 deletions llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 24
; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: s_flbit_i32_b32 s2, s2
; SI-NEXT: s_sub_i32 s4, s2, 24
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
Expand All @@ -326,8 +327,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 24
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: s_sub_i32 s2, s2, 24
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -347,13 +349,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
Expand Down Expand Up @@ -389,8 +391,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 16
; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_and_b32 s2, s2, 0xffff
; SI-NEXT: s_flbit_i32_b32 s2, s2
; SI-NEXT: s_add_i32 s4, s2, -16
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down Expand Up @@ -423,13 +426,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -16(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
Expand Down Expand Up @@ -587,8 +590,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
Expand All @@ -602,8 +605,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -615,7 +618,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
Expand All @@ -624,11 +627,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: -24(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
Expand Down Expand Up @@ -683,8 +685,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
Expand Down Expand Up @@ -719,7 +721,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
Expand All @@ -728,11 +730,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT * T0.W, T0.X,
; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: -16(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
Expand Down Expand Up @@ -1101,8 +1102,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
Expand All @@ -1115,8 +1116,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
Expand All @@ -1135,13 +1136,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
Expand Down
91 changes: 91 additions & 0 deletions llvm/test/CodeGen/Hexagon/loop-balign.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
; RUN: llc -march=hexagon -O3 < %s | FileCheck %s -check-prefix=BALIGN
; BALIGN: .p2align{{.*}}5

; Test that the basic block 'for.body4.for.body4_crit_edge' is aligned by the
; Hexagon loop-align machinery, i.e. that a .p2align 5 directive is emitted in
; the generated assembly for this function.

define dso_local void @foo(i32 %nCol, i32 %nRow, ptr nocapture %resMat) local_unnamed_addr {
entry:
%shl = shl i32 %nRow, 2
%cmp36 = icmp sgt i32 %nRow, 0
%0 = add i32 %nCol, -1
%.inv = icmp slt i32 %0, 1
%1 = select i1 %.inv, i32 1, i32 %nCol
br label %Outerloop

; Outermost loop; carries the running accumulator (%r7_6.0), the output
; pointer (%r5.0), and the byte offset (%r8.0) across iterations.
Outerloop: ; preds = %for.end7, %entry
%r12.0 = phi i32 [ 0, %entry ], [ %inc8, %for.end7 ]
%r7_6.0 = phi i64 [ undef, %entry ], [ %r7_6.1.lcssa, %for.end7 ]
%r0i.0 = phi i32 [ undef, %entry ], [ %r0i.1.lcssa, %for.end7 ]
%r5.0 = phi ptr [ %resMat, %entry ], [ %r5.1.lcssa, %for.end7 ]
%r8.0 = phi i32 [ %shl, %entry ], [ %r8.1.lcssa, %for.end7 ]
br i1 %cmp36, label %for.body.lr.ph, label %for.end7

for.body.lr.ph: ; preds = %Outerloop
%cmp332 = icmp eq i32 %r12.0, 0
%exitcond.peel = icmp eq i32 %r12.0, 1
br label %for.body

; Middle loop over the rows (%i.039 in [0, %nRow)).
for.body: ; preds = %for.end, %for.body.lr.ph
%r8.141 = phi i32 [ %r8.0, %for.body.lr.ph ], [ %add, %for.end ]
%r5.140 = phi ptr [ %r5.0, %for.body.lr.ph ], [ %add.ptr, %for.end ]
%i.039 = phi i32 [ 0, %for.body.lr.ph ], [ %inc6, %for.end ]
%r0i.138 = phi i32 [ %r0i.0, %for.body.lr.ph ], [ %4, %for.end ]
%r7_6.137 = phi i64 [ %r7_6.0, %for.body.lr.ph ], [ %r7_6.2.lcssa, %for.end ]
%add = add nsw i32 %r8.141, %shl
br i1 %cmp332, label %for.end, label %for.body4.peel

; Peeled first iteration of the innermost loop.
for.body4.peel: ; preds = %for.body
%r1i.0.in.peel = inttoptr i32 %r8.141 to ptr
%r1i.0.peel = load i32, ptr %r1i.0.in.peel, align 4
%2 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.137, i32 %r1i.0.peel, i32 %r0i.138)
br i1 %exitcond.peel, label %for.end, label %for.body4.preheader.peel.newph

for.body4.preheader.peel.newph: ; preds = %for.body4.peel
%r1i.0.in = inttoptr i32 %add to ptr
%r1i.0 = load i32, ptr %r1i.0.in, align 4
br label %for.body4

; Innermost multiply-accumulate loop; its latch goes through the critical
; edge block whose alignment this test checks.
for.body4: ; preds = %for.body4.for.body4_crit_edge, %for.body4.preheader.peel.newph
%inc.phi = phi i32 [ %inc.0, %for.body4.for.body4_crit_edge ], [ 2, %for.body4.preheader.peel.newph ]
%r7_6.233 = phi i64 [ %3, %for.body4.for.body4_crit_edge ], [ %2, %for.body4.preheader.peel.newph ]
%3 = tail call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %r7_6.233, i32 %r1i.0, i32 %r0i.138)
%exitcond = icmp eq i32 %inc.phi, %r12.0
br i1 %exitcond, label %for.end.loopexit, label %for.body4.for.body4_crit_edge

; The block expected to receive the .p2align 5 directive.
for.body4.for.body4_crit_edge: ; preds = %for.body4
%inc.0 = add nuw nsw i32 %inc.phi, 1
br label %for.body4

for.end.loopexit: ; preds = %for.body4
br label %for.end

for.end: ; preds = %for.end.loopexit, %for.body4.peel, %for.body
%r7_6.2.lcssa = phi i64 [ %r7_6.137, %for.body ], [ %2, %for.body4.peel ], [ %3, %for.end.loopexit ]
%4 = tail call i32 @llvm.hexagon.S2.clbp(i64 %r7_6.2.lcssa)
store i32 %4, ptr %r5.140, align 4
%add.ptr = getelementptr inbounds i8, ptr %r5.140, i32 undef
%inc6 = add nuw nsw i32 %i.039, 1
%exitcond47 = icmp eq i32 %inc6, %nRow
br i1 %exitcond47, label %for.end7.loopexit, label %for.body

for.end7.loopexit: ; preds = %for.end
br label %for.end7

for.end7: ; preds = %for.end7.loopexit, %Outerloop
%r7_6.1.lcssa = phi i64 [ %r7_6.0, %Outerloop ], [ %r7_6.2.lcssa, %for.end7.loopexit ]
%r0i.1.lcssa = phi i32 [ %r0i.0, %Outerloop ], [ %4, %for.end7.loopexit ]
%r5.1.lcssa = phi ptr [ %r5.0, %Outerloop ], [ %add.ptr, %for.end7.loopexit ]
%r8.1.lcssa = phi i32 [ %r8.0, %Outerloop ], [ %add, %for.end7.loopexit ]
%inc8 = add nuw i32 %r12.0, 1
%exitcond48 = icmp eq i32 %inc8, %1
br i1 %exitcond48, label %if.end, label %Outerloop

if.end: ; preds = %for.end7
ret void
}

; Function Attrs: nounwind readnone
declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)

; Function Attrs: nounwind readnone
declare i32 @llvm.hexagon.S2.clbp(i64)
115 changes: 115 additions & 0 deletions llvm/test/CodeGen/Hexagon/loop_align_count.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b \
; RUN: -debug-only=hexagon-loop-align 2>&1 < %s | FileCheck %s
; Validate that there are 4 bundles in the loop.
; The loop-align pass debug output reports the bundle count for the inner
; HVX loop (bb28) and the loop is expected to be aligned to 32 bytes.

; CHECK: Loop Align Pass:
; CHECK: Bundle Count : 4
; CHECK: .p2align{{.*}}5

; Function Attrs: nounwind
define void @ham(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
bb:
%ashr = ashr i32 %arg3, 2
%ashr6 = ashr i32 %arg3, 1
%add = add nsw i32 %ashr6, %ashr
%icmp = icmp sgt i32 %arg2, 0
br i1 %icmp, label %bb7, label %bb61

bb7: ; preds = %bb
%sdiv = sdiv i32 %arg1, 64
%icmp8 = icmp sgt i32 %arg1, 63
br label %bb9

; Outer loop stepping %phi by 2 per iteration (see bb57).
bb9: ; preds = %bb57, %bb7
%phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
%ashr10 = ashr exact i32 %phi, 1
%mul = mul nsw i32 %ashr10, %arg3
br i1 %icmp8, label %bb11, label %bb57

; Inner-loop preheader: compute the four input and two output pointers.
bb11: ; preds = %bb9
%add12 = add nsw i32 %phi, 1
%mul13 = mul nsw i32 %add12, %arg5
%mul14 = mul nsw i32 %phi, %arg5
%add15 = add i32 %add, %mul
%add16 = add i32 %mul, %ashr
%add17 = add i32 %mul, %ashr6
%getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
%getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
%getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
%getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
%getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
%getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
%bitcast = bitcast ptr %getelementptr to ptr
%bitcast23 = bitcast ptr %getelementptr18 to ptr
%bitcast24 = bitcast ptr %getelementptr19 to ptr
%bitcast25 = bitcast ptr %getelementptr20 to ptr
%bitcast26 = bitcast ptr %getelementptr21 to ptr
%bitcast27 = bitcast ptr %getelementptr22 to ptr
br label %bb28

; Inner HVX loop: four 64-byte vector loads, add/sub/avg/navg/sathub
; combines, two vector stores per iteration. This is the loop whose
; bundle count the pass reports.
bb28: ; preds = %bb28, %bb11
%phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
%phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
%phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
%phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
%phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
%phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
%phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
%getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
%load = load <16 x i32>, ptr %phi30, align 64
%getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
%load38 = load <16 x i32>, ptr %phi31, align 64
%getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
%load40 = load <16 x i32>, ptr %phi32, align 64
%getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
%load42 = load <16 x i32>, ptr %phi33, align 64
%call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
%call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
%call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
%call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
%call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
%call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
%call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
%call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
%call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
%call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
%getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
store <16 x i32> %call50, ptr %phi35, align 64
%getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
store <16 x i32> %call51, ptr %phi34, align 64
%add54 = add nsw i32 %phi29, 1
%icmp55 = icmp slt i32 %add54, %sdiv
br i1 %icmp55, label %bb28, label %bb56

bb56: ; preds = %bb28
br label %bb57

bb57: ; preds = %bb56, %bb9
%add58 = add nsw i32 %phi, 2
%icmp59 = icmp slt i32 %add58, %arg2
br i1 %icmp59, label %bb9, label %bb60

bb60: ; preds = %bb57
br label %bb61

bb61: ; preds = %bb60, %bb
ret void
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
130 changes: 130 additions & 0 deletions llvm/test/CodeGen/Hexagon/loop_align_count.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# RUN: llc -march=hexagon -O3 -run-pass hexagon-loop-align -o - %s\
# RUN: -debug-only=hexagon-loop-align -verify-machineinstrs 2>&1 | FileCheck %s

# Test that we only count instructions up until the endloop instruction, and
# that this loop (bb.5, terminated by ENDLOOP0) is aligned to 32 bytes.
# CHECK: Loop Align Pass:
# CHECK: Instruction Count : 16
# CHECK: bb.5 (align 32)
---
# Machine function with a nested hardware loop: J2_loop1r/ENDLOOP1 around
# bb.2..bb.7, and the inner J2_loop0r/ENDLOOP0 loop at bb.5.
name: fred
tracksRegLiveness: true

body: |
bb.0:
successors: %bb.1(0x50000000), %bb.8(0x30000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5
renamable $p0 = C2_cmpgti renamable $r2, 0
J2_jumpf killed renamable $p0, %bb.8, implicit-def dead $pc
J2_jump %bb.1, implicit-def dead $pc
bb.1:
successors: %bb.2(0x80000000)
liveins: $r0, $r1, $r2, $r3, $r4, $r5
renamable $r7 = A2_addi killed renamable $r2, 1
renamable $r8 = S2_asr_i_r renamable $r1, 31
renamable $p0 = C2_cmpgti renamable $r1, 63
renamable $r2 = S2_asr_i_r renamable $r3, 2
renamable $r6 = S2_asr_i_r renamable $r3, 1
renamable $r9 = S2_lsr_i_r killed renamable $r7, 1
renamable $r1 = S2_lsr_i_r_acc killed renamable $r1, killed renamable $r8, 26
renamable $r7 = A2_tfrsi 0
renamable $r1 = S2_asr_i_r killed renamable $r1, 6
J2_loop1r %bb.2, killed renamable $r9, implicit-def $lc1, implicit-def $sa1
renamable $r8 = nsw A2_add renamable $r6, renamable $r2
bb.2:
successors: %bb.3(0x40000000), %bb.7(0x40000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
J2_jumpf renamable $p0, %bb.7, implicit-def dead $pc
J2_jump %bb.3, implicit-def dead $pc
bb.3:
successors: %bb.4(0x80000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
renamable $r13 = exact S2_asr_i_r renamable $r7, 1
renamable $r12 = COPY renamable $r4
renamable $r9 = COPY renamable $r4
renamable $r14 = nsw A2_addi renamable $r7, 1
renamable $r15 = nsw M2_mpyi killed renamable $r13, renamable $r3
renamable $r9 = M2_maci killed renamable $r9, killed renamable $r14, renamable $r5
renamable $r13 = A2_add renamable $r8, renamable $r15
renamable $r28 = A2_add renamable $r15, renamable $r2
renamable $r10 = A2_add renamable $r15, renamable $r6
renamable $r12 = M2_maci killed renamable $r12, renamable $r7, renamable $r5
renamable $r13 = S2_addasl_rrri renamable $r0, killed renamable $r13, 1
renamable $r14 = S2_addasl_rrri renamable $r0, killed renamable $r15, 1
renamable $r15 = S2_addasl_rrri renamable $r0, killed renamable $r28, 1
renamable $r28 = S2_addasl_rrri renamable $r0, killed renamable $r10, 1
bb.4:
successors: %bb.5(0x40000000), %bb.6(0x40000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28
renamable $v0, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
renamable $p1 = C2_cmpgtui renamable $r1, 1
renamable $r10 = A2_addi renamable $r1, -1
renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
renamable $v1 = V6_vaddh renamable $v0, renamable $v2
renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
renamable $v0 = V6_vsubh killed renamable $v0, killed renamable $v2
J2_loop0r %bb.5, killed renamable $r10, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
renamable $v2 = V6_vaddh renamable $v3, renamable $v4
J2_jumpf killed renamable $p1, %bb.6, implicit-def $pc
J2_jump %bb.5, implicit-def $pc
bb.5:
successors: %bb.5(0x7c000000), %bb.6(0x04000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r13, $r14, $r15, $r28, $v0, $v1, $v2, $v3, $v4
renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
renamable $v4, renamable $r14 = V6_vL32b_pi killed renamable $r14, 64
renamable $v5 = V6_vnavgh renamable $v1, renamable $v2
renamable $v1 = V6_vavgh killed renamable $v1, killed renamable $v2
renamable $v2, renamable $r28 = V6_vL32b_pi killed renamable $r28, 64
renamable $v1 = V6_vsathub killed renamable $v5, killed renamable $v1
renamable $v5 = V6_vnavgh renamable $v0, renamable $v3
renamable $v6 = V6_vavgh killed renamable $v0, killed renamable $v3
renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
renamable $v1 = V6_vaddh renamable $v4, renamable $v2
renamable $v3, renamable $r15 = V6_vL32b_pi killed renamable $r15, 64
renamable $v0 = V6_vsubh killed renamable $v4, killed renamable $v2
renamable $v4, renamable $r13 = V6_vL32b_pi killed renamable $r13, 64
renamable $v2 = V6_vaddh renamable $v3, renamable $v4
renamable $v5 = V6_vsathub killed renamable $v5, killed renamable $v6
renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v5
ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
J2_jump %bb.6, implicit-def $pc
bb.6:
successors: %bb.7(0x80000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $v0, $v1, $v2, $v3, $v4
renamable $v3 = V6_vsubh killed renamable $v3, killed renamable $v4
renamable $v4 = V6_vavgh renamable $v1, renamable $v2
renamable $v1 = V6_vnavgh killed renamable $v1, killed renamable $v2
renamable $v2 = V6_vavgh renamable $v0, renamable $v3
renamable $v0 = V6_vnavgh killed renamable $v0, killed renamable $v3
renamable $v1 = V6_vsathub killed renamable $v1, killed renamable $v4
dead renamable $r12 = V6_vS32b_pi killed renamable $r12, 64, killed renamable $v1
renamable $v0 = V6_vsathub killed renamable $v0, killed renamable $v2
dead renamable $r9 = V6_vS32b_pi killed renamable $r9, 64, killed renamable $v0
J2_jump %bb.7, implicit-def $pc
bb.7:
successors: %bb.2(0x7c000000), %bb.8(0x04000000)
liveins: $p0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8
renamable $r7 = nsw A2_addi killed renamable $r7, 2
ENDLOOP1 %bb.2, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
J2_jump %bb.8, implicit-def dead $pc
bb.8:
PS_jmpret $r31, implicit-def dead $pc
...
117 changes: 117 additions & 0 deletions llvm/test/CodeGen/Hexagon/v6-haar-balign32.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
; RUN: llc -march=hexagon -mcpu=hexagonv73 -O2 -mattr=+hvxv73,hvx-length64b < %s | FileCheck %s
; CHECK: .p2align{{.*}}5

; Function Attrs: nounwind
; One stage of a Haar-style vertical butterfly over rows of i16 data using
; 64-byte HVX vectors (filename suggests Haar; the RUN line only checks that
; the hot loop gets .p2align 5 — presumably via the HexagonLoopAlign pass
; added in this change. TODO confirm against the pass).
;   %arg  : source plane of i16, read-only
;   %arg1 : row width in elements (inner loop runs %arg1/64 chunks)
;   %arg2 : row-pair count bound for the outer loop
;   %arg3 : source row stride in elements (quarter/half strides derived below)
;   %arg4 : destination plane of i8
;   %arg5 : destination row stride in bytes
define void @wobble(ptr noalias nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3, ptr noalias nocapture %arg4, i32 %arg5) #0 {
bb:
; Quarter (%ashr) and half (%ashr6) of the source stride; %add = 3/4 stride.
%ashr = ashr i32 %arg3, 2
%ashr6 = ashr i32 %arg3, 1
%add = add nsw i32 %ashr6, %ashr
; Skip everything if there are no rows to process.
%icmp = icmp sgt i32 %arg2, 0
br i1 %icmp, label %bb7, label %bb61

bb7: ; preds = %bb
; Inner trip count: one iteration per 64-element (<16 x i32>) chunk.
%sdiv = sdiv i32 %arg1, 64
%icmp8 = icmp sgt i32 %arg1, 63
br label %bb9

bb9: ; preds = %bb57, %bb7
; Outer loop: %phi advances by 2 (row pairs); %mul is the base source offset.
%phi = phi i32 [ 0, %bb7 ], [ %add58, %bb57 ]
%ashr10 = ashr exact i32 %phi, 1
%mul = mul nsw i32 %ashr10, %arg3
br i1 %icmp8, label %bb11, label %bb57

bb11: ; preds = %bb9
; Set up the four source row pointers (base, +1/4, +1/2, +3/4 stride) and the
; two destination row pointers for this row pair.
%add12 = add nsw i32 %phi, 1
%mul13 = mul nsw i32 %add12, %arg5
%mul14 = mul nsw i32 %phi, %arg5
%add15 = add i32 %add, %mul
%add16 = add i32 %mul, %ashr
%add17 = add i32 %mul, %ashr6
%getelementptr = getelementptr inbounds i8, ptr %arg4, i32 %mul13
%getelementptr18 = getelementptr inbounds i8, ptr %arg4, i32 %mul14
%getelementptr19 = getelementptr inbounds i16, ptr %arg, i32 %add15
%getelementptr20 = getelementptr inbounds i16, ptr %arg, i32 %add16
%getelementptr21 = getelementptr inbounds i16, ptr %arg, i32 %add17
%getelementptr22 = getelementptr inbounds i16, ptr %arg, i32 %mul
; No-op ptr-to-ptr bitcasts: residue of the typed-pointer -> opaque-pointer
; migration; kept as-is since this is checked-in test input.
%bitcast = bitcast ptr %getelementptr to ptr
%bitcast23 = bitcast ptr %getelementptr18 to ptr
%bitcast24 = bitcast ptr %getelementptr19 to ptr
%bitcast25 = bitcast ptr %getelementptr20 to ptr
%bitcast26 = bitcast ptr %getelementptr21 to ptr
%bitcast27 = bitcast ptr %getelementptr22 to ptr
br label %bb28

bb28: ; preds = %bb28, %bb11
; Hot inner loop (the loop the .p2align check targets): loads 4 source
; vectors, runs an add/sub + avg/navg butterfly, saturates i16 -> u8 with
; vsathub, and stores one vector to each destination row per iteration.
%phi29 = phi i32 [ 0, %bb11 ], [ %add54, %bb28 ]
%phi30 = phi ptr [ %bitcast27, %bb11 ], [ %getelementptr36, %bb28 ]
%phi31 = phi ptr [ %bitcast26, %bb11 ], [ %getelementptr37, %bb28 ]
%phi32 = phi ptr [ %bitcast25, %bb11 ], [ %getelementptr39, %bb28 ]
%phi33 = phi ptr [ %bitcast24, %bb11 ], [ %getelementptr41, %bb28 ]
%phi34 = phi ptr [ %bitcast, %bb11 ], [ %getelementptr53, %bb28 ]
%phi35 = phi ptr [ %bitcast23, %bb11 ], [ %getelementptr52, %bb28 ]
%getelementptr36 = getelementptr inbounds <16 x i32>, ptr %phi30, i32 1
%load = load <16 x i32>, ptr %phi30, align 64, !tbaa !1
%getelementptr37 = getelementptr inbounds <16 x i32>, ptr %phi31, i32 1
%load38 = load <16 x i32>, ptr %phi31, align 64, !tbaa !1
%getelementptr39 = getelementptr inbounds <16 x i32>, ptr %phi32, i32 1
%load40 = load <16 x i32>, ptr %phi32, align 64, !tbaa !1
%getelementptr41 = getelementptr inbounds <16 x i32>, ptr %phi33, i32 1
%load42 = load <16 x i32>, ptr %phi33, align 64, !tbaa !1
; First butterfly: sums and differences of the two row pairs.
%call = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load, <16 x i32> %load38)
%call43 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load, <16 x i32> %load38)
%call44 = tail call <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32> %load40, <16 x i32> %load42)
%call45 = tail call <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32> %load40, <16 x i32> %load42)
; Second stage: averaging (vavgh) and negative-averaging (vnavgh) halves.
%call46 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call, <16 x i32> %call44)
%call47 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call, <16 x i32> %call44)
%call48 = tail call <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32> %call43, <16 x i32> %call45)
%call49 = tail call <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32> %call43, <16 x i32> %call45)
; Pack/saturate halfwords to unsigned bytes for the two output rows.
%call50 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call47, <16 x i32> %call46)
%call51 = tail call <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32> %call49, <16 x i32> %call48)
%getelementptr52 = getelementptr inbounds <16 x i32>, ptr %phi35, i32 1
store <16 x i32> %call50, ptr %phi35, align 64, !tbaa !1
%getelementptr53 = getelementptr inbounds <16 x i32>, ptr %phi34, i32 1
store <16 x i32> %call51, ptr %phi34, align 64, !tbaa !1
%add54 = add nsw i32 %phi29, 1
%icmp55 = icmp slt i32 %add54, %sdiv
br i1 %icmp55, label %bb28, label %bb56

bb56: ; preds = %bb28
br label %bb57

bb57: ; preds = %bb56, %bb9
; Advance to the next row pair.
%add58 = add nsw i32 %phi, 2
%icmp59 = icmp slt i32 %add58, %arg2
br i1 %icmp59, label %bb9, label %bb60

bb60: ; preds = %bb57
br label %bb61

bb61: ; preds = %bb60, %bb
ret void
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vaddh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vsubh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vavgh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vnavgh(<16 x i32>, <16 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x i32> @llvm.hexagon.V6.vsathub(<16 x i32>, <16 x i32>) #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }

!llvm.ident = !{!0}

!0 = !{!"Clang 3.1"}
!1 = !{!2, !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}
3 changes: 3 additions & 0 deletions llvm/test/Transforms/LoopRotate/oz-disable.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
; RUN: opt < %s -S -passes='default<Os>' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS
; RUN: opt < %s -S -passes='default<Oz>' -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OZ

;; Make sure -allow-loop-header-duplication overrides the default behavior at Oz
; RUN: opt < %s -S -passes='default<Oz>' -enable-loop-header-duplication -debug -debug-only=loop-rotate 2>&1 | FileCheck %s -check-prefix=OS

; Loop should be rotated for -Os but not for -Oz.
; OS: rotating Loop at depth 1
; OZ-NOT: rotating Loop at depth 1
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ attributes #7 = { builtin }
; IR: define {{.*}} @_Z1Mv.memprof.1()
; IR: call {{.*}} @_Z2XZv.memprof.1()

; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }

; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3

;; Check that -enable-loop-header-duplication at Oz enables certain types of
;; optimizations, for example replacing the loop body w/ a call to memset. If
;; loop idiom recognition begins to recognize unrotated loops, this test will
;; need to be updated.

; RUN: opt -passes='default<Oz>' -S < %s | FileCheck %s --check-prefix=NOROTATION
; RUN: opt -passes='default<Oz>' -S -enable-loop-header-duplication < %s | FileCheck %s --check-prefix=ROTATION
; RUN: opt -passes='default<O2>' -S < %s | FileCheck %s --check-prefix=ROTATION

;; Byte-fill loop in unrotated (header-exit) form. Without rotation the loop
;; idiom pass leaves it alone (NOROTATION); with rotation (O2, or Oz plus
;; -enable-loop-header-duplication) the whole body collapses to one memset
;; (ROTATION). CHECK bodies below are autogenerated by update_test_checks.py;
;; do not edit them by hand.
define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr {
; NOROTATION-LABEL: define void @test(
; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] {
; NOROTATION-NEXT: entry:
; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]]
; NOROTATION: loop.header:
; NOROTATION-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], [[ENTRY:%.*]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; NOROTATION-NEXT: [[_12_I:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
; NOROTATION-NEXT: br i1 [[_12_I]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
; NOROTATION: loop.latch:
; NOROTATION-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1
; NOROTATION-NEXT: store i8 1, ptr [[PTR_IV]], align 1
; NOROTATION-NEXT: br label [[LOOP_HEADER]]
; NOROTATION: exit:
; NOROTATION-NEXT: ret void
;
; ROTATION-LABEL: define void @test(
; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] {
; ROTATION-NEXT: entry:
; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]]
; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]]
; ROTATION: loop.latch.preheader:
; ROTATION-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64
; ROTATION-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64
; ROTATION-NEXT: [[TMP0:%.*]] = sub i64 [[END3]], [[START4]]
; ROTATION-NEXT: tail call void @llvm.memset.p0.i64(ptr nonnull align 1 [[START]], i8 1, i64 [[TMP0]], i1 false)
; ROTATION-NEXT: br label [[EXIT]]
; ROTATION: exit:
; ROTATION-NEXT: ret void
;
entry:
br label %loop.header

;; Exit test lives in the header: the classic shape loop-rotate transforms.
loop.header:
%ptr.iv = phi i8* [ %start, %entry ], [ %ptr.iv.next, %loop.latch ]
%_12.i = icmp eq i8* %ptr.iv, %end
br i1 %_12.i, label %exit, label %loop.latch

;; Latch stores 1 to the current byte and bumps the pointer — i.e. memset(1).
loop.latch:
%ptr.iv.next = getelementptr inbounds i8, i8* %ptr.iv, i64 1
store i8 1, i8* %ptr.iv, align 1
br label %loop.header

exit:
ret void
}
11 changes: 8 additions & 3 deletions llvm/tools/llvm-profgen/PerfReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "PerfReader.h"
#include "ProfileGenerator.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Process.h"
Expand Down Expand Up @@ -361,8 +362,11 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary,
exitWithError("Perf not found.");
}
std::string PerfPath = *PerfExecutable;
std::string PerfTraceFile = PerfData.str() + ".script.tmp";
std::string ErrorFile = PerfData.str() + ".script.err.tmp";

SmallString<128> PerfTraceFile;
sys::fs::createUniquePath("perf-script-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%.tmp",
PerfTraceFile, /*MakeAbsolute=*/true);
std::string ErrorFile = std::string(PerfTraceFile) + ".err";
StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events",
"-F", "comm,pid", "-i",
PerfData};
Expand Down Expand Up @@ -400,7 +404,8 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary,
PIDs, "-i", PerfData};
sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects);

return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent};
return {std::string(PerfTraceFile), PerfFormat::PerfScript,
PerfContent::UnknownContent};
}

void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ static_library("modernize") {
"UseBoolLiteralsCheck.cpp",
"UseConstraintsCheck.cpp",
"UseDefaultMemberInitCheck.cpp",
"UseDesignatedInitializersCheck.cpp",
"UseEmplaceCheck.cpp",
"UseEqualsDefaultCheck.cpp",
"UseEqualsDeleteCheck.cpp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ static_library("utils") {
"ASTUtils.cpp",
"Aliasing.cpp",
"DeclRefExprUtils.cpp",
"DesignatedInitializers.cpp",
"ExceptionAnalyzer.cpp",
"ExceptionSpecAnalyzer.cpp",
"ExprSequence.cpp",
Expand Down
1 change: 1 addition & 0 deletions llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ static_library("LLVMHexagonCodeGen") {
"HexagonISelLowering.cpp",
"HexagonISelLoweringHVX.cpp",
"HexagonInstrInfo.cpp",
"HexagonLoopAlign.cpp",
"HexagonLoopIdiomRecognition.cpp",
"HexagonMCInstLower.cpp",
"HexagonMachineFunctionInfo.cpp",
Expand Down
7 changes: 1 addition & 6 deletions mlir/include/mlir/Query/Matcher/ErrorBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,8 @@ enum class ErrorType {
None,

// Parser Errors
ParserChainedExprInvalidArg,
ParserChainedExprNoCloseParen,
ParserChainedExprNoOpenParen,
ParserFailedToBuildMatcher,
ParserInvalidToken,
ParserMalformedChainedExpr,
ParserNoCloseParen,
ParserNoCode,
ParserNoComma,
Expand All @@ -54,10 +50,9 @@ enum class ErrorType {

// Registry Errors
RegistryMatcherNotFound,
RegistryNotBindable,
RegistryValueNotFound,
RegistryWrongArgCount,
RegistryWrongArgType,
RegistryWrongArgType
};

void addError(Diagnostics *error, SourceRange range, ErrorType errorType,
Expand Down
7 changes: 0 additions & 7 deletions mlir/include/mlir/Query/Matcher/MatchersInternal.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,8 @@ class DynMatcher {

bool match(Operation *op) const { return implementation->match(op); }

void setFunctionName(StringRef name) { functionName = name.str(); };

bool hasFunctionName() const { return !functionName.empty(); };

StringRef getFunctionName() const { return functionName; };

private:
llvm::IntrusiveRefCntPtr<MatcherInterface> implementation;
std::string functionName;
};

} // namespace mlir::query::matcher
Expand Down
10 changes: 0 additions & 10 deletions mlir/lib/Query/Matcher/Diagnostics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ static llvm::StringRef errorTypeToFormatString(ErrorType type) {
return "Incorrect type for arg $0. (Expected = $1) != (Actual = $2)";
case ErrorType::RegistryValueNotFound:
return "Value not found: $0";
case ErrorType::RegistryNotBindable:
return "Matcher does not support binding.";

case ErrorType::ParserStringError:
return "Error parsing string token: <$0>";
Expand All @@ -59,14 +57,6 @@ static llvm::StringRef errorTypeToFormatString(ErrorType type) {
return "Unexpected end of code.";
case ErrorType::ParserOverloadedType:
return "Input value has unresolved overloaded type: $0";
case ErrorType::ParserMalformedChainedExpr:
return "Period not followed by valid chained call.";
case ErrorType::ParserChainedExprInvalidArg:
return "Missing/Invalid argument for the chained call.";
case ErrorType::ParserChainedExprNoCloseParen:
return "Missing ')' for the chained call.";
case ErrorType::ParserChainedExprNoOpenParen:
return "Missing '(' for the chained call.";
case ErrorType::ParserFailedToBuildMatcher:
return "Failed to build matcher: $0.";

Expand Down
67 changes: 5 additions & 62 deletions mlir/lib/Query/Matcher/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,12 @@ struct Parser::TokenInfo {
text = newText;
}

// Known identifiers.
static const char *const ID_Extract;

llvm::StringRef text;
TokenKind kind = TokenKind::Eof;
SourceRange range;
VariantValue value;
};

const char *const Parser::TokenInfo::ID_Extract = "extract";

class Parser::CodeTokenizer {
public:
// Constructor with matcherCode and error
Expand Down Expand Up @@ -303,36 +298,6 @@ bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
}

bool Parser::parseChainedExpression(std::string &argument) {
// Parse the parenthesized argument to .extract("foo")
// Note: EOF is handled inside the consume functions and would fail below when
// checking token kind.
const TokenInfo openToken = tokenizer->consumeNextToken();
const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();

if (openToken.kind != TokenKind::OpenParen) {
error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen);
return false;
}

if (argumentToken.kind != TokenKind::Literal ||
!argumentToken.value.isString()) {
error->addError(argumentToken.range,
ErrorType::ParserChainedExprInvalidArg);
return false;
}

if (closeToken.kind != TokenKind::CloseParen) {
error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen);
return false;
}

// If all checks passed, extract the argument and return true.
argument = argumentToken.value.getString();
return true;
}

// Parse the arguments of a matcher
bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
const TokenInfo &nameToken, TokenInfo &endToken) {
Expand Down Expand Up @@ -399,34 +364,13 @@ bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
return false;
}

std::string functionName;
if (tokenizer->peekNextToken().kind == TokenKind::Period) {
tokenizer->consumeNextToken();
TokenInfo chainCallToken = tokenizer->consumeNextToken();
if (chainCallToken.kind == TokenKind::CodeCompletion) {
addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract"));
return false;
}

if (chainCallToken.kind != TokenKind::Ident ||
chainCallToken.text != TokenInfo::ID_Extract) {
error->addError(chainCallToken.range,
ErrorType::ParserMalformedChainedExpr);
return false;
}

if (chainCallToken.text == TokenInfo::ID_Extract &&
!parseChainedExpression(functionName))
return false;
}

if (!ctor)
return false;
// Merge the start and end infos.
SourceRange matcherRange = nameToken.range;
matcherRange.end = endToken.range.end;
VariantMatcher result = sema->actOnMatcherExpression(
*ctor, matcherRange, functionName, args, error);
VariantMatcher result =
sema->actOnMatcherExpression(*ctor, matcherRange, args, error);
if (result.isNull())
return false;
*value = result;
Expand Down Expand Up @@ -526,10 +470,9 @@ Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
}

VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
llvm::ArrayRef<ParserValue> args, Diagnostics *error) {
return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
error);
MatcherCtor ctor, SourceRange nameRange, llvm::ArrayRef<ParserValue> args,
Diagnostics *error) {
return RegistryManager::constructMatcher(ctor, nameRange, args, error);
}

std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
Expand Down
18 changes: 8 additions & 10 deletions mlir/lib/Query/Matcher/Parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ class Parser {

// Process a matcher expression. The caller takes ownership of the Matcher
// object returned.
virtual VariantMatcher actOnMatcherExpression(
MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
llvm::ArrayRef<ParserValue> args, Diagnostics *error) = 0;
virtual VariantMatcher
actOnMatcherExpression(MatcherCtor ctor, SourceRange nameRange,
llvm::ArrayRef<ParserValue> args,
Diagnostics *error) = 0;

// Look up a matcher by name in the matcher name found by the parser.
virtual std::optional<MatcherCtor>
Expand All @@ -92,11 +93,10 @@ class Parser {
std::optional<MatcherCtor>
lookupMatcherCtor(llvm::StringRef matcherName) override;

VariantMatcher actOnMatcherExpression(MatcherCtor Ctor,
SourceRange NameRange,
StringRef functionName,
ArrayRef<ParserValue> Args,
Diagnostics *Error) override;
VariantMatcher actOnMatcherExpression(MatcherCtor ctor,
SourceRange nameRange,
llvm::ArrayRef<ParserValue> args,
Diagnostics *error) override;

std::vector<ArgKind> getAcceptedCompletionTypes(
llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) override;
Expand Down Expand Up @@ -153,8 +153,6 @@ class Parser {
Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
const NamedValueMap *namedValues, Diagnostics *error);

bool parseChainedExpression(std::string &argument);

bool parseExpressionImpl(VariantValue *value);

bool parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
Expand Down
15 changes: 2 additions & 13 deletions mlir/lib/Query/Matcher/RegistryManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,19 +132,8 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,

VariantMatcher RegistryManager::constructMatcher(
MatcherCtor ctor, internal::SourceRange nameRange,
llvm::StringRef functionName, llvm::ArrayRef<ParserValue> args,
internal::Diagnostics *error) {
VariantMatcher out = ctor->create(nameRange, args, error);
if (functionName.empty() || out.isNull())
return out;

if (std::optional<DynMatcher> result = out.getDynMatcher()) {
result->setFunctionName(functionName);
return VariantMatcher::SingleMatcher(*result);
}

error->addError(nameRange, internal::ErrorType::RegistryNotBindable);
return {};
llvm::ArrayRef<ParserValue> args, internal::Diagnostics *error) {
return ctor->create(nameRange, args, error);
}

} // namespace mlir::query::matcher
1 change: 0 additions & 1 deletion mlir/lib/Query/Matcher/RegistryManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ class RegistryManager {

static VariantMatcher constructMatcher(MatcherCtor ctor,
internal::SourceRange nameRange,
llvm::StringRef functionName,
ArrayRef<ParserValue> args,
internal::Diagnostics *error);
};
Expand Down
80 changes: 1 addition & 79 deletions mlir/lib/Query/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

#include "mlir/Query/Query.h"
#include "QueryParser.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Query/Matcher/MatchFinder.h"
#include "mlir/Query/QuerySession.h"
#include "mlir/Support/LogicalResult.h"
Expand All @@ -36,70 +34,6 @@ static void printMatch(llvm::raw_ostream &os, QuerySession &qs, Operation *op,
"\"" + binding + "\" binds here");
}

// TODO: Extract into a helper function that can be reused outside query
// context.
static Operation *extractFunction(std::vector<Operation *> &ops,
MLIRContext *context,
llvm::StringRef functionName) {
context->loadDialect<func::FuncDialect>();
OpBuilder builder(context);

// Collect data for function creation
std::vector<Operation *> slice;
std::vector<Value> values;
std::vector<Type> outputTypes;

for (auto *op : ops) {
// Return op's operands are propagated, but the op itself isn't needed.
if (!isa<func::ReturnOp>(op))
slice.push_back(op);

// All results are returned by the extracted function.
outputTypes.insert(outputTypes.end(), op->getResults().getTypes().begin(),
op->getResults().getTypes().end());

// Track all values that need to be taken as input to function.
values.insert(values.end(), op->getOperands().begin(),
op->getOperands().end());
}

// Create the function
FunctionType funcType =
builder.getFunctionType(ValueRange(values), outputTypes);
auto loc = builder.getUnknownLoc();
func::FuncOp funcOp = func::FuncOp::create(loc, functionName, funcType);

builder.setInsertionPointToEnd(funcOp.addEntryBlock());

// Map original values to function arguments
IRMapping mapper;
for (const auto &arg : llvm::enumerate(values))
mapper.map(arg.value(), funcOp.getArgument(arg.index()));

// Clone operations and build function body
std::vector<Operation *> clonedOps;
std::vector<Value> clonedVals;
for (Operation *slicedOp : slice) {
Operation *clonedOp =
clonedOps.emplace_back(builder.clone(*slicedOp, mapper));
clonedVals.insert(clonedVals.end(), clonedOp->result_begin(),
clonedOp->result_end());
}
// Add return operation
builder.create<func::ReturnOp>(loc, clonedVals);

// Remove unused function arguments
size_t currentIndex = 0;
while (currentIndex < funcOp.getNumArguments()) {
if (funcOp.getArgument(currentIndex).use_empty())
funcOp.eraseArgument(currentIndex);
else
++currentIndex;
}

return funcOp;
}

Query::~Query() = default;

mlir::LogicalResult InvalidQuery::run(llvm::raw_ostream &os,
Expand Down Expand Up @@ -131,21 +65,9 @@ mlir::LogicalResult QuitQuery::run(llvm::raw_ostream &os,

mlir::LogicalResult MatchQuery::run(llvm::raw_ostream &os,
QuerySession &qs) const {
Operation *rootOp = qs.getRootOp();
int matchCount = 0;
std::vector<Operation *> matches =
matcher::MatchFinder().getMatches(rootOp, matcher);

// An extract call is recognized by considering if the matcher has a name.
// TODO: Consider making the extract more explicit.
if (matcher.hasFunctionName()) {
auto functionName = matcher.getFunctionName();
Operation *function =
extractFunction(matches, rootOp->getContext(), functionName);
os << "\n" << *function << "\n\n";
return mlir::success();
}

matcher::MatchFinder().getMatches(qs.getRootOp(), matcher);
os << "\n";
for (Operation *op : matches) {
os << "Match #" << ++matchCount << ":\n\n";
Expand Down
23 changes: 14 additions & 9 deletions mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_loose.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
// DEFINE: %{run_opts} = -e main -entry-point-result=void
// DEFINE: %{run_opts} = -e entry -entry-point-result=void
// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
//
Expand All @@ -28,7 +28,7 @@
}>

module {
func.func @main() {
func.func @entry() {
%c0 = arith.constant 0 : index
%f0 = arith.constant 0.0 : f64
%d = arith.constant dense<[[ 1.0, 2.0, 3.0, 4.0 ],
Expand All @@ -39,14 +39,19 @@ module {
%s = sparse_tensor.convert %d : tensor<5x4xf64> to tensor<5x4xf64, #CSR_hi>

//
// CHECK: ---- Sparse Tensor ----
// CHECK-NEXT: nse = 17
// CHECK-NEXT: pos[1] : ( 0, 4, 4, 8, 8, 9, 9, 13
// CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3
// CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16
// CHECK-NEXT: ----
// CHECK: ( 0, 4, 4, 8, 8, 9, 9, 13 )
// CHECK-NEXT: ( 0, 1, 2, 3, 0, 1, 2, 3, 2, 0, 1, 2, 3, 0, 1, 2, 3 )
// CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8, 5.5, 9, 10, 11, 12, 13, 14, 15, 16 )
//
sparse_tensor.print %s : tensor<5x4xf64, #CSR_hi>
%pos = sparse_tensor.positions %s {level = 1 : index } : tensor<5x4xf64, #CSR_hi> to memref<?xindex>
%vecp = vector.transfer_read %pos[%c0], %c0 : memref<?xindex>, vector<8xindex>
vector.print %vecp : vector<8xindex>
%crd = sparse_tensor.coordinates %s {level = 1 : index } : tensor<5x4xf64, #CSR_hi> to memref<?xindex>
%vecc = vector.transfer_read %crd[%c0], %c0 : memref<?xindex>, vector<17xindex>
vector.print %vecc : vector<17xindex>
%val = sparse_tensor.values %s : tensor<5x4xf64, #CSR_hi> to memref<?xf64>
%vecv = vector.transfer_read %val[%c0], %f0 : memref<?xf64>, vector<17xf64>
vector.print %vecv : vector<17xf64>

// Release the resources.
bufferization.dealloc_tensor %s: tensor<5x4xf64, #CSR_hi>
Expand Down
Loading