1 change: 0 additions & 1 deletion libc/test/src/sys/mman/linux/mincore_test.cpp
@@ -16,7 +16,6 @@
#include "src/sys/mman/munmap.h"
#include "src/unistd/sysconf.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"

#include <sys/mman.h>
1 change: 0 additions & 1 deletion libc/test/src/sys/mman/linux/mlock_test.cpp
@@ -20,7 +20,6 @@
#include "src/sys/resource/getrlimit.h"
#include "src/unistd/sysconf.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"

#include <linux/capability.h>
1 change: 0 additions & 1 deletion libc/test/src/sys/mman/linux/msync_test.cpp
@@ -14,7 +14,6 @@
#include "src/sys/mman/munmap.h"
#include "src/unistd/sysconf.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"

using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
2 changes: 1 addition & 1 deletion libc/test/src/sys/mman/linux/shm_test.cpp
@@ -16,7 +16,7 @@
#include "src/unistd/close.h"
#include "src/unistd/ftruncate.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"
#include <sys/syscall.h>

using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
1 change: 0 additions & 1 deletion libc/test/src/unistd/access_test.cpp
@@ -12,7 +12,6 @@
#include "src/unistd/close.h"
#include "src/unistd/unlink.h"
#include "test/UnitTest/ErrnoSetterMatcher.h"
#include "test/UnitTest/LibcTest.h"
#include "test/UnitTest/Test.h"

#include <sys/stat.h>
322 changes: 0 additions & 322 deletions libcxx/test/support/experimental_any_helpers.h

This file was deleted.

3 changes: 3 additions & 0 deletions lldb/include/lldb/Target/Target.h
@@ -37,6 +37,7 @@
#include "lldb/Utility/RealpathPrefixes.h"
#include "lldb/Utility/Timeout.h"
#include "lldb/lldb-public.h"
#include "llvm/ADT/StringRef.h"

namespace lldb_private {

@@ -114,6 +115,8 @@ class TargetProperties : public Properties {

void SetDisableSTDIO(bool b);

llvm::StringRef GetLaunchWorkingDirectory() const;

const char *GetDisassemblyFlavor() const;

InlineStrategy GetInlineStrategy() const;
7 changes: 7 additions & 0 deletions lldb/source/Commands/CommandObjectProcess.cpp
@@ -201,6 +201,13 @@ class CommandObjectProcessLaunch : public CommandObjectProcessLaunchOrAttach {
if (target->GetDisableSTDIO())
m_options.launch_info.GetFlags().Set(eLaunchFlagDisableSTDIO);

if (!m_options.launch_info.GetWorkingDirectory()) {
if (llvm::StringRef wd = target->GetLaunchWorkingDirectory();
!wd.empty()) {
m_options.launch_info.SetWorkingDirectory(FileSpec(wd));
}
}

// Merge the launch info environment with the target environment.
Environment target_env = target->GetEnvironment();
m_options.launch_info.GetEnvironment().insert(target_env.begin(),
5 changes: 4 additions & 1 deletion lldb/source/Commands/Options.td
@@ -691,7 +691,10 @@ let Command = "process launch" in {
def process_launch_plugin : Option<"plugin", "P">, Arg<"Plugin">,
Desc<"Name of the process plugin you want to use.">;
def process_launch_working_dir : Option<"working-dir", "w">, Arg<"DirectoryName">,
Desc<"Set the current working directory to <path> when running the inferior.">;
Desc<"Set the current working directory to <path> when running the inferior. This option "
"applies only to the current `process launch` invocation. If "
"`target.launch-working-dir` is set and this option is given, the value of this "
"option will be used instead of the setting.">;
def process_launch_arch : Option<"arch", "a">, Arg<"Architecture">,
Desc<"Set the architecture for the process to launch when ambiguous.">;
def process_launch_environment : Option<"environment", "E">,
9 changes: 6 additions & 3 deletions lldb/source/Symbol/CompileUnit.cpp
@@ -328,14 +328,17 @@ void CompileUnit::ResolveSymbolContext(
// We will miss functions that ONLY exist as a call site entry.

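// A column_num of 0 means "any column": the column check below is only
// applied when the caller asked for a specific, nonzero column.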
if (line_entry.IsValid() &&
(line_entry.line != line || line_entry.column != column_num) &&
resolve_scope & eSymbolContextLineEntry && check_inlines) {
(line_entry.line != line ||
(column_num != 0 && line_entry.column != column_num)) &&
(resolve_scope & eSymbolContextLineEntry) && check_inlines) {
// We don't move lines over function boundaries, so the address in the
// line entry will be in the function that contained the line that might
// be a CallSite, and we can just iterate over that function to find any
// inline records, and dig up their call sites.
Address start_addr = line_entry.range.GetBaseAddress();
Function *function = start_addr.CalculateSymbolContextFunction();
// Record the size of the list to see if we added to it:
size_t old_sc_list_size = sc_list.GetSize();

Declaration sought_decl(file_spec, line, column_num);
// We use this recursive function to descend the block structure looking
@@ -417,7 +420,7 @@
// FIXME: Should I also do this for "call site line exists between the
// given line number and the later line we found in the line table"? That's
// a closer approximation to our general sliding algorithm.
if (sc_list.GetSize())
if (sc_list.GetSize() > old_sc_list_size)
return;
}

5 changes: 5 additions & 0 deletions lldb/source/Target/Target.cpp
@@ -4471,6 +4471,11 @@ void TargetProperties::SetDisableSTDIO(bool b) {
const uint32_t idx = ePropertyDisableSTDIO;
SetPropertyAtIndex(idx, b);
}
llvm::StringRef TargetProperties::GetLaunchWorkingDirectory() const {
const uint32_t idx = ePropertyLaunchWorkingDir;
return GetPropertyAtIndexAs<llvm::StringRef>(
idx, g_target_properties[idx].default_cstr_value);
}

const char *TargetProperties::GetDisassemblyFlavor() const {
const uint32_t idx = ePropertyDisassemblyFlavor;
7 changes: 7 additions & 0 deletions lldb/source/Target/TargetProperties.td
@@ -201,6 +201,13 @@ let Definition = "target" in {
def DebugUtilityExpression: Property<"debug-utility-expression", "Boolean">,
DefaultFalse,
Desc<"Enable debugging of LLDB-internal utility expressions.">;
def LaunchWorkingDir: Property<"launch-working-dir", "String">,
DefaultStringValue<"">,
Desc<"A default value for the working directory to use when launching processes. "
"It is ignored when empty. This setting is only used when the target is "
"launched. If you change this setting, the new value will only apply to "
"subsequent launches. Commands that take an explicit working directory "
"will override this setting.">;
}

let Definition = "process_experimental" in {
50 changes: 47 additions & 3 deletions lldb/source/Target/ThreadPlanStepRange.cpp
@@ -379,10 +379,10 @@ bool ThreadPlanStepRange::SetNextBranchBreakpoint() {
!m_next_branch_bp_sp->HasResolvedLocations())
m_could_not_resolve_hw_bp = true;

BreakpointLocationSP bp_loc =
m_next_branch_bp_sp->GetLocationAtIndex(0);
if (log) {
lldb::break_id_t bp_site_id = LLDB_INVALID_BREAK_ID;
BreakpointLocationSP bp_loc =
m_next_branch_bp_sp->GetLocationAtIndex(0);
if (bp_loc) {
BreakpointSiteSP bp_site = bp_loc->GetBreakpointSite();
if (bp_site) {
@@ -395,7 +395,51 @@
m_next_branch_bp_sp->GetID(), bp_site_id,
run_to_address.GetLoadAddress(&m_process.GetTarget()));
}

// The "next branch breakpoint might land on a virtual inlined call
// stack. If that's true, we should always stop at the top of the
// inlined call stack. Only virtual steps should walk deeper into the
// inlined call stack.
Block *block = run_to_address.CalculateSymbolContextBlock();
if (bp_loc && block) {
LineEntry top_most_line_entry;
lldb::addr_t run_to_addr = run_to_address.GetFileAddress();
for (Block *inlined_parent = block->GetContainingInlinedBlock();
inlined_parent;
inlined_parent = inlined_parent->GetInlinedParent()) {
AddressRange range;
if (!inlined_parent->GetRangeContainingAddress(run_to_address,
range))
break;
Address range_start_address = range.GetBaseAddress();
// Only compare addresses here; we may have different symbol
// contexts (for virtual inlined stacks), but we just want to know
// that they are all at the same address.
if (range_start_address.GetFileAddress() != run_to_addr)
break;
const InlineFunctionInfo *inline_info =
inlined_parent->GetInlinedFunctionInfo();
if (!inline_info)
break;
const Declaration &call_site = inline_info->GetCallSite();
top_most_line_entry.line = call_site.GetLine();
top_most_line_entry.column = call_site.GetColumn();
FileSpec call_site_file_spec = call_site.GetFile();
top_most_line_entry.original_file_sp.reset(
new SupportFile(call_site_file_spec));
top_most_line_entry.range = range;
top_most_line_entry.file_sp.reset();
top_most_line_entry.ApplyFileMappings(
GetThread().CalculateTarget());
if (!top_most_line_entry.file_sp)
top_most_line_entry.file_sp =
top_most_line_entry.original_file_sp;
}
if (top_most_line_entry.IsValid()) {
LLDB_LOG(log, "Setting preferred line entry: {0}:{1}",
top_most_line_entry.GetFile(), top_most_line_entry.line);
bp_loc->SetPreferredLineEntry(top_most_line_entry);
}
}
m_next_branch_bp_sp->SetThreadID(m_tid);
m_next_branch_bp_sp->SetBreakpointKind("next-branch-location");

58 changes: 58 additions & 0 deletions lldb/test/API/commands/process/launch/TestProcessLaunch.py
@@ -8,6 +8,7 @@
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil
from pathlib import Path


class ProcessLaunchTestCase(TestBase):
@@ -206,3 +207,60 @@ def test_environment_with_special_char(self):
self.assertEqual(value, evil_var)
process.Continue()
self.assertState(process.GetState(), lldb.eStateExited, PROCESS_EXITED)

@skipIfRemote
def test_target_launch_working_dir_prop(self):
"""Test that the setting `target.launch-working-dir` is correctly used when launching a process."""
d = {"CXX_SOURCES": "print_cwd.cpp"}
self.build(dictionary=d)
self.setTearDownCleanup(d)
exe = self.getBuildArtifact("a.out")
self.runCmd("file " + exe)

mywd = "my_working_dir"
out_file_name = "my_working_dir_test.out"

my_working_dir_path = self.getBuildArtifact(mywd)
lldbutil.mkdir_p(my_working_dir_path)
out_file_path = os.path.join(my_working_dir_path, out_file_name)
another_working_dir_path = Path(
os.path.join(my_working_dir_path, "..")
).resolve()

# If -w is not passed to process launch, then the setting will be used.
self.runCmd(
f"settings set target.launch-working-dir {another_working_dir_path}"
)
launch_command = f"process launch -o {out_file_path}"

self.expect(
launch_command,
patterns=["Process .* launched: .*a.out"],
)

out = lldbutil.read_file_on_target(self, out_file_path)

self.assertIn(f"stdout: {another_working_dir_path}", out)

# If -w is passed to process launch, that value will be used instead of the setting.
launch_command = f"process launch -w {my_working_dir_path} -o {out_file_path}"

self.expect(
launch_command,
patterns=["Process .* launched: .*a.out"],
)

out = lldbutil.read_file_on_target(self, out_file_path)
self.assertIn(f"stdout: {my_working_dir_path}", out)

# If set to empty, then LLDB's cwd will be used to launch the process.
self.runCmd(f"settings set target.launch-working-dir ''")
launch_command = f"process launch -o {out_file_path}"

self.expect(
launch_command,
patterns=["Process .* launched: .*a.out"],
)

out = lldbutil.read_file_on_target(self, out_file_path)
self.assertNotIn(f"stdout: {another_working_dir_path}", out)
19 changes: 19 additions & 0 deletions lldb/test/API/functionalities/breakpoint/same_cu_name/Makefile
@@ -0,0 +1,19 @@
CXX_SOURCES := main.cpp
LD_EXTRAS := ns1.o ns2.o ns3.o ns4.o

a.out: main.o ns1.o ns2.o ns3.o ns4.o

ns1.o: common.cpp
$(CC) -g -c -DNAMESPACE=ns1 -o $@ $<

ns2.o: common.cpp
$(CC) -g -c -DNAMESPACE=ns2 -o $@ $<

ns3.o: common.cpp
$(CC) -g -c -DNAMESPACE=ns3 -o $@ $<

ns4.o: common.cpp
$(CC) -g -c -DNAMESPACE=ns4 -o $@ $<


include Makefile.rules
@@ -0,0 +1,32 @@
"""
Test setting a breakpoint by file and line when many instances of the
same file name exist in the CU list.
"""


import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil


class TestBreakpointSameCU(TestBase):
def test_breakpoint_same_cu(self):
self.build()
target = self.createTestTarget()

# First, break on the comment line before the code:
comment_line = line_number("common.cpp", "// A comment here")
self.assertNotEqual(comment_line, 0, "line_number worked")
bkpt = target.BreakpointCreateByLocation("common.cpp", comment_line)
self.assertEqual(
bkpt.GetNumLocations(), 4, "Got the right number of breakpoints"
)

# And break on the code, both should work:
code_line = line_number("common.cpp", "// The line with code")
self.assertNotEqual(comment_line, 0, "line_number worked again")
bkpt = target.BreakpointCreateByLocation("common.cpp", code_line)
self.assertEqual(
bkpt.GetNumLocations(), 4, "Got the right number of breakpoints"
)
@@ -0,0 +1,8 @@
namespace NAMESPACE {
static int g_value = 0;
void DoSomeStuff() {
// A comment here
g_value++; // The line with code
}

} // namespace NAMESPACE
24 changes: 24 additions & 0 deletions lldb/test/API/functionalities/breakpoint/same_cu_name/main.cpp
@@ -0,0 +1,24 @@
namespace ns1 {
extern void DoSomeStuff();
}

namespace ns2 {
extern void DoSomeStuff();
}

namespace ns3 {
extern void DoSomeStuff();
}

namespace ns4 {
extern void DoSomeStuff();
}

int main(int argc, char *argv[]) {
ns1::DoSomeStuff();
ns2::DoSomeStuff();
ns3::DoSomeStuff();
ns4::DoSomeStuff();

return 0;
}
@@ -13,7 +13,6 @@ class TestInlineStepping(TestBase):
compiler="icc",
bugnumber="# Not really a bug. ICC combines two inlined functions.",
)
@skipIf(oslist=["linux"], archs=["arm"]) # Fails for 32 bit arm
def test_with_python_api(self):
"""Test stepping over and into inlined functions."""
self.build()
@@ -291,7 +290,7 @@ def inline_stepping_step_over(self):
break_1_in_main = target.BreakpointCreateBySourceRegex(
"// At second call of caller_ref_1 in main.", self.main_source_spec
)
self.assertTrue(break_1_in_main, VALID_BREAKPOINT)
self.assertGreater(break_1_in_main.GetNumLocations(), 0, VALID_BREAKPOINT)

# Now launch the process, and do not stop at entry point.
self.process = target.LaunchSimple(
@@ -317,6 +316,24 @@
]
self.run_step_sequence(step_sequence)

# Now make sure that a "next" that lands on a virtual inlined call stack
# gets the call stack depth correct.
break_2_in_main = target.BreakpointCreateBySourceRegex(
"// Call max_value specialized", self.main_source_spec
)
self.assertGreater(break_2_in_main.GetNumLocations(), 0, VALID_BREAKPOINT)
threads = lldbutil.continue_to_breakpoint(self.process, break_2_in_main)
self.assertEqual(len(threads), 1, "Hit our second breakpoint")
self.assertEqual(threads[0].id, self.thread.id, "Stopped at right thread")
self.thread.StepOver()
frame_0 = self.thread.frames[0]
line_entry = frame_0.line_entry
self.assertEqual(
line_entry.file.basename, self.main_source_spec.basename, "File matches"
)
target_line = line_number("calling.cpp", "// At caller_trivial_inline_1")
self.assertEqual(line_entry.line, target_line, "Lines match as well.")

def step_in_template(self):
"""Use Python APIs to test stepping in to templated functions."""
exe = self.getBuildArtifact("a.out")
6 changes: 3 additions & 3 deletions llvm/docs/AMDGPUUsage.rst
@@ -1645,9 +1645,9 @@ The AMDGPU backend supports the following LLVM IR attributes.
reduced by heuristics.

"amdgpu-max-num-workgroups"="x,y,z" Specify the maximum number of work groups for the kernel dispatch in the
X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
the three numbers are >= 1.
X, Y, and Z dimensions. Each number must be >= 1. Generated by the
``amdgpu_max_num_work_groups`` CLANG attribute [CLANG-ATTR]_. Clang only
emits this attribute when all three numbers are >= 1.

"amdgpu-no-agpr" Indicates the function will not require allocating AGPRs. This is only
relevant on subtargets with AGPRs. The behavior is undefined if a
3 changes: 2 additions & 1 deletion llvm/docs/LangRef.rst
@@ -122,13 +122,14 @@ And the hard way:
.. code-block:: llvm

%0 = add i32 %X, %X ; yields i32:%0
%1 = add i32 %0, %0 ; yields i32:%1
%1 = add i32 %0, %0 /* yields i32:%1 */
%result = add i32 %1, %1

This last way of multiplying ``%X`` by 8 illustrates several important
lexical features of LLVM:

#. Comments are delimited with a '``;``' and go until the end of line.
Alternatively, comments can start with ``/*`` and terminate with ``*/``.
#. Unnamed temporaries are created when the result of a computation is
not assigned to a named value.
#. By default, unnamed temporaries are numbered sequentially (using a
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.md
@@ -309,6 +309,8 @@ Changes to LLDB
* Program stdout/stderr redirection will now open the file with the O_TRUNC flag, making sure the file is truncated if the path already exists.
  * e.g. `settings set target.output-path/target.error-path <path/to/file>`

* A new setting `target.launch-working-dir` can be used to set a persistent cwd that is used by default by `process launch` and `run`.
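  * e.g. `settings set target.launch-working-dir /tmp/build` (path illustrative); an explicit `process launch -w <dir>` still takes precedence over the setting.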

Changes to BOLT
---------------------------------

2 changes: 2 additions & 0 deletions llvm/docs/SPIRVUsage.rst
@@ -179,6 +179,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
- Provides additional information to a compiler, similar to the llvm.assume and llvm.expect intrinsics.
* - ``SPV_KHR_float_controls``
- Provides new execution modes to control floating-point computations by overriding an implementation’s default behavior for rounding modes, denormals, signed zero, and infinities.
* - ``SPV_KHR_integer_dot_product``
- Adds instructions for dot product operations on integer vectors with optional accumulation. Integer vectors include 4-component vectors of 8-bit integers and 4-component vectors of 8-bit integers packed into 32-bit integers.
* - ``SPV_KHR_linkonce_odr``
- Allows the use of the LinkOnceODR linkage type, which lets a function or global variable be merged with other functions or global variables of the same name when linkage occurs.
* - ``SPV_KHR_no_integer_wrap_decoration``
1 change: 1 addition & 0 deletions llvm/include/llvm/AsmParser/LLLexer.h
@@ -95,6 +95,7 @@ namespace llvm {

int getNextChar();
void SkipLineComment();
bool SkipCComment();
lltok::Kind ReadString(lltok::Kind kind);
bool ReadVarName();

1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -69,6 +69,7 @@
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>;
1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -83,6 +83,7 @@ let TargetPrefix = "spv" in {
DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>],
[IntrNoMem, Commutative] >;
def int_spv_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>;
def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>;
def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>;
11 changes: 6 additions & 5 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1511,12 +1511,13 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
if (isNoWrapAddRec(Ptr, AR, PSE, Lp))
return Stride;

// An nusw getelementptr that is a AddRec with a unit stride
// cannot wrap per definition. If it did, the result would be poison
// and any memory access dependent on it would be immediate UB
// when executed.
// An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
// the distance between the previously accessed location and the wrapped
// location will be larger than half the pointer index type space. In that
// case, the GEP would be poison and any memory access dependent on it would
// be immediate UB when executed.
if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
GEP && GEP->hasNoUnsignedSignedWrap() && (Stride == 1 || Stride == -1))
GEP && GEP->hasNoUnsignedSignedWrap())
return Stride;

// If the null pointer is undefined, then an access sequence which would
29 changes: 28 additions & 1 deletion llvm/lib/AsmParser/LLLexer.cpp
@@ -200,7 +200,6 @@ lltok::Kind LLLexer::LexToken() {
// Handle letters: [a-zA-Z_]
if (isalpha(static_cast<unsigned char>(CurChar)) || CurChar == '_')
return LexIdentifier();

return lltok::Error;
case EOF: return lltok::Eof;
case 0:
@@ -251,6 +250,12 @@
case ',': return lltok::comma;
case '*': return lltok::star;
case '|': return lltok::bar;
case '/':
if (getNextChar() != '*')
return lltok::Error;
if (SkipCComment())
return lltok::Error;
continue;
}
}
}
@@ -262,6 +267,28 @@
}
}

/// This skips C-style /**/ comments. Returns true if there
/// was an error.
bool LLLexer::SkipCComment() {
while (true) {
int CurChar = getNextChar();
switch (CurChar) {
case EOF:
LexError("unterminated comment");
return true;
case '*':
// End of the comment?
CurChar = getNextChar();
if (CurChar == '/')
return false;
if (CurChar == EOF) {
LexError("unterminated comment");
return true;
}
}
}
}

/// Lex all tokens that start with an @ character.
/// GlobalVar @\"[^\"]*\"
/// GlobalVar @[-a-zA-Z$._][-a-zA-Z$._0-9]*
3 changes: 2 additions & 1 deletion llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -241,7 +241,8 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
prependRetWithPatchableExit(MF, TII, op);
break;
}
case Triple::ArchType::ppc64le: {
case Triple::ArchType::ppc64le:
case Triple::ArchType::systemz: {
// PPC and SystemZ have conditional returns. Turn them into branch and plain returns.
InstrumentationOptions op;
op.HandleTailcall = false;
17 changes: 12 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -504,14 +504,21 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,

Kern[".max_flat_workgroup_size"] =
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {

uint32_t NumWGY = MFI.getMaxNumWorkGroupsY();
uint32_t NumWGZ = MFI.getMaxNumWorkGroupsZ();
uint32_t NumWGX = MFI.getMaxNumWorkGroupsX();

// TODO: Should consider 0 invalid and reject in IR verifier.
if (NumWGX != std::numeric_limits<uint32_t>::max() && NumWGX != 0)
Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX);

if (NumWGY != std::numeric_limits<uint32_t>::max() && NumWGY != 0)
Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY);

if (NumWGZ != std::numeric_limits<uint32_t>::max() && NumWGZ != 0)
Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ);
}

Kern[".sgpr_spill_count"] =
Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
Kern[".vgpr_spill_count"] =
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -371,5 +371,6 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
std::numeric_limits<uint32_t>::max());
}
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1514,9 +1514,9 @@ multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
Gather4 = 1 in {
let VDataDwords = 2 in
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64, /*enableDisasm*/ true>; /* for packed D16 only */
let VDataDwords = 4 in
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
let VDataDwords = 5 in
defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -176,7 +176,7 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
45 changes: 23 additions & 22 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3479,7 +3479,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64) {
Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
Expand All @@ -3499,7 +3499,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64;
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

@@ -3532,16 +3532,16 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
// V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;

const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
Expand All @@ -3556,8 +3556,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e64)
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

@@ -3611,24 +3611,24 @@

unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;

// V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
// V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;

// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.

if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e64)
Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

@@ -3851,19 +3851,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}

assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
"V_FMAC_F16_t16_e32 is not supported and not expected to be present "
"pre-RA");
assert(
Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
"V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
"pre-RA");

// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64;
Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
@@ -3877,7 +3878,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
@@ -3962,7 +3963,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
@@ -3981,7 +3982,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
unsigned NewOpc =
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
@@ -4436,7 +4437,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
case AMDGPU::V_FMAC_F16_t16_e64:
case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
case AMDGPU::V_FMAC_LEGACY_F32_e64:
@@ -5483,7 +5484,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1755,6 +1755,21 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
1 : VSrc_b32);
}

// Returns the vreg register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
RegisterOperand ret =
!cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
!eq(VT.Size, 96) : RegisterOperand<VReg_96>,
!eq(VT.Size, 64) : RegisterOperand<VReg_64>,
!eq(VT.Size, 48) : RegisterOperand<VReg_64>,
!eq(VT.Size, 16) : !if(IsTrue16,
!if(IsFake16, RegisterOperand<VGPR_32>,
RegisterOperand<VGPR_16>),
RegisterOperand<VGPR_32>),
1 : RegisterOperand<VGPR_32>);
}

// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
12 changes: 6 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3200,20 +3200,20 @@ def : GCNPat <
let SubtargetPredicate = isGFX10Plus in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let OtherPredicates = [NotHasTrue16BitInsts] in
let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(fma (f16 (VOP3NoMods f32:$src0)),
(f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let OtherPredicates = [HasTrue16BitInsts] in
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f32:$src0)),
(f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
(fma (f16 (VOP3NoMods f16:$src0)),
(f16 (VOP3NoMods f16:$src1)),
(f16 (VOP3NoMods f16:$src2))),
(V_FMAC_F16_fake16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
}
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,7 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
break;
}
@@ -484,7 +484,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
break;
}
11 changes: 6 additions & 5 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -563,8 +563,8 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx12 ||
Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi ||
Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi ||
@@ -1307,15 +1307,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
}

SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
unsigned Size) {
unsigned Size,
unsigned DefaultVal) {
assert(Size > 2);
SmallVector<unsigned> Default(Size, 0);
SmallVector<unsigned> Default(Size, DefaultVal);

Attribute A = F.getFnAttribute(Name);
if (!A.isStringAttribute())
return Default;

SmallVector<unsigned> Vals(Size, 0);
SmallVector<unsigned> Vals(Size, DefaultVal);

LLVMContext &Ctx = F.getContext();

3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -919,7 +919,8 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
///
/// \returns false if any error occurs.
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
unsigned Size);
unsigned Size,
unsigned DefaultVal = 0);

/// Represents the counter values to wait for in an s_waitcnt instruction.
///
124 changes: 75 additions & 49 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -95,6 +95,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo

// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let True16Predicate = ps.True16Predicate;
let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
@@ -373,10 +374,10 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
}

def VOP_MADAK_F16 : VOP_MADAK <f16>;
def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F32 : VOP_MADAK <f32>;

@@ -398,18 +399,20 @@ class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
}

def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1);
}
def VOP_MADMK_F32 : VOP_MADMK <f32>;

// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret, 3,
// Src2 must accept the same operand types as vdst, namely VGPRs only
let Src2RC64 = getVOP3VRegSrcForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
let Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
@@ -464,21 +467,18 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
}

def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F16_t16 : VOP_MAC <f16> {
def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
let IsTrue16 = 1;
let HasOpSel = 1;
let AsmVOP3OpSel = getAsmVOP3OpSel<2/*NumSrcArgs*/, HasClamp, HasOMod,
HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret;
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
let DstRC64 = VOPDstOperand<VGPR_32>;
let Src1RC32 = VGPRSrc_32_Lo128;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2);
let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
@@ -488,10 +488,18 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
dpp8:$dpp8, Dpp8FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
let DstRC64 = getVALUDstForVT<DstVT>.ret;
let Src0VOP3DPP = VGPRSrc_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 1/*IsFake16*/>.ret;
let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1Mod = getSrcMod<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src2Mod = getSrcMod<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
}

def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0, HasExt32BitDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
@@ -650,15 +658,18 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
}

def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
// V_CNDMASK_B16 is VOP3 only
def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
let IsTrue16 = 1;
let DstRC64 = getVALUDstForVT<DstVT>.ret;

let Src0Mod = getSrcMod<f16>.ret;
let Src1Mod = getSrcMod<f16>.ret;
let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;

let Src0VOP3DPP = VGPRSrc_32;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 1/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 1/*IsFake16*/>.ret;
}

Expand Down Expand Up @@ -924,7 +935,6 @@ let FPDPRounding = 1 in {
let SubtargetPredicate = UseFakeTrue16Insts in
defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">;
} // End FPDPRounding = 1
// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
defm V_LSHRREV_B16 : VOP2Inst_e64_t16 <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>;
defm V_ASHRREV_I16 : VOP2Inst_e64_t16 <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>;
@@ -986,18 +996,18 @@ let SubtargetPredicate = isGFX11Plus in {

let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
}
let SubtargetPredicate = HasTrue16BitInsts in {
def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">;
let True16Predicate = UseFakeTrue16Insts in {
def V_FMAMK_F16_fake16 : VOP2_Pseudo <"v_fmamk_f16_fake16", VOP_MADMK_F16_fake16, [], "">;
}

let isCommutable = 1 in {
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
}
let SubtargetPredicate = HasTrue16BitInsts in {
def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">;
let True16Predicate = UseFakeTrue16Insts in {
def V_FMAAK_F16_fake16 : VOP2_Pseudo <"v_fmaak_f16_fake16", VOP_MADAK_F16_fake16, [], "">;
}
} // End isCommutable = 1
} // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1
@@ -1006,22 +1016,24 @@ let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in {
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
let SubtargetPredicate = isGFX10Plus in {
let True16Predicate = NotHasTrue16BitInsts in {
defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
}
let SubtargetPredicate = HasTrue16BitInsts in {
defm V_FMAC_F16_t16 : VOP2Inst <"v_fmac_f16_t16", VOP_MAC_F16_t16>;
let True16Predicate = UseFakeTrue16Insts in {
defm V_FMAC_F16_fake16 : VOP2Inst <"v_fmac_f16_fake16", VOP_MAC_F16_fake16>;
}
} // End SubtargetPredicate = isGFX10Plus
} // End FMAC Constraints

let SubtargetPredicate = Has16BitInsts in {
let isReMaterializable = 1 in {
let FPDPRounding = 1 in {
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
} // End FPDPRounding = 1
let isCommutable = 1 in {
let mayRaiseFPException = 0 in {
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
}
let SubtargetPredicate = isGFX8GFX9 in {
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
@@ -1576,14 +1588,20 @@ multiclass VOP2_Real_FULL_with_name_gfx12<bits<6> op, string opName,
string asmName> :
VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;

multiclass VOP2_Real_FULL_t16_with_name_gfx12<bits<6> op, string opName,
string asmName, string alias> {
multiclass VOP2_Real_FULL_t16_gfx12<bits<6> op, string opName,
string asmName, string alias> {
defm NAME : VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
def _gfx12_2nd_alias : AMDGPUMnemonicAlias<alias, asmName> {
let AssemblerPredicate = isGFX12Only;
}
}

multiclass VOP2_Real_FULL_t16_and_fake16_gfx12<bits<6> op, string opName,
string asmName, string alias> {
defm _t16: VOP2_Real_FULL_t16_gfx12<op, opName#"_t16", asmName, alias>;
defm _fake16: VOP2_Real_FULL_t16_gfx12<op, opName#"_fake16", asmName, alias>;
}

multiclass VOP2_Real_NO_DPP_with_name_gfx12<bits<6> op, string opName,
string asmName> :
VOP2_Real_NO_DPP_with_name<GFX12Gen, op, opName, asmName>;
Expand All @@ -1607,10 +1625,8 @@ defm V_SUBREV_CO_CI_U32 :

defm V_MIN_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x015, "V_MIN_F32", "v_min_num_f32">;
defm V_MAX_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x016, "V_MAX_F32", "v_max_num_f32">;
defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_t16", "v_min_num_f16", "v_min_f16">;
defm V_MIN_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_fake16", "v_min_num_f16", "v_min_f16">;
defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_t16", "v_max_num_f16", "v_max_f16">;
defm V_MAX_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_fake16", "v_max_num_f16", "v_max_f16">;
defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x030, "V_MIN_F16", "v_min_num_f16", "v_min_f16">;
defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x031, "V_MAX_F16", "v_max_num_f16", "v_max_f16">;

let SubtargetPredicate = isGFX12Plus in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx12>;
@@ -1645,6 +1661,14 @@ multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
}
}

multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName, string opName = NAME> :
VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>;

multiclass VOP2_Real_FULL_t16_and_fake16_gfx11<bits<6> op, string asmName, string opName = NAME> {
defm opName#"_t16": VOP2_Real_FULL_t16_gfx11<op, asmName, opName#"_t16">;
defm opName#"_fake16": VOP2_Real_FULL_t16_gfx11<op, asmName, opName#"_fake16">;
}

multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
string asmName> :
VOP2_Real_NO_DPP_with_name<GFX11Gen, op, opName, asmName>;
@@ -1675,14 +1699,16 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string as
multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3beOnly_Realtriple<GFX11Gen, op>, VOP3beOnly_Realtriple<GFX12Gen, op>;

multiclass VOP2Only_Real_MADK_with_name_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
multiclass VOP2Only_Real_MADK_t16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
VOP2Only_Real_MADK_with_name<GFX11Gen, op, asmName, opName>,
VOP2Only_Real_MADK_with_name<GFX12Gen, op, asmName, opName>;

multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName,
string opName = NAME> :
VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>;
multiclass VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> {
defm _t16: VOP2Only_Real_MADK_t16_gfx11_gfx12<op, asmName, opName#"_t16">;
defm _fake16: VOP2Only_Real_MADK_t16_gfx11_gfx12<op, asmName, opName#"_fake16">;
}

multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
@@ -1721,15 +1747,15 @@ defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16
defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_FMAC_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x037, "v_fmamk_f16">;
defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x038, "v_fmaak_f16">;
defm V_FMAMK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x037, "v_fmamk_f16">;
defm V_FMAAK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x038, "v_fmaak_f16">;

// VOP3 only.
defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
10 changes: 10 additions & 0 deletions llvm/lib/Target/DirectX/DXIL.td
@@ -788,6 +788,16 @@ def SplitDouble : DXILOp<102, splitDouble> {
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
}

def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
"accumulate to i32";
let LLVMIntrinsic = int_dx_dot4add_i8packed;
let arguments = [Int32Ty, Int32Ty, Int32Ty];
let result = Int32Ty;
let attributes = [Attributes<DXIL1_0, [ReadNone]>];
let stages = [Stages<DXIL1_0, [all_stages]>];
}

def AnnotateHandle : DXILOp<216, annotateHandle> {
let Doc = "annotate handle with resource properties";
let arguments = [HandleTy, ResPropsTy];
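For readers skimming the new op, here is a hedged reference sketch of the semantics described by the Doc string above ("signed dot product of 4 x i8 vectors packed into i32, with accumulate to i32"). It is plain C++ for illustration; the helper name is made up, and this is not the DXIL or SPIR-V lowering:

```cpp
#include <cstdint>

// Reference semantics only: unpack four signed 8-bit lanes from each 32-bit
// operand, multiply lane-wise, and accumulate the products into `acc`.
int32_t dot4add_i8packed_ref(int32_t a, int32_t b, int32_t acc) {
  int32_t sum = acc;
  for (int lane = 0; lane < 4; ++lane) {
    // Extract bits [8*lane, 8*lane+7] and reinterpret them as signed.
    auto ai = static_cast<int8_t>((static_cast<uint32_t>(a) >> (8 * lane)) & 0xffu);
    auto bi = static_cast<int8_t>((static_cast<uint32_t>(b) >> (8 * lane)) & 0xffu);
    sum += static_cast<int32_t>(ai) * static_cast<int32_t>(bi);
  }
  return sum;
}
```

Packing the four i8 lanes into a single i32 keeps each operand in one scalar register, which is what lets targets with a hardware dot-product instruction lower the whole loop above to a single operation.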
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -473,7 +473,7 @@

auto &AbsActions = getActionDefinitionsBuilder(G_ABS);
if (ST.hasStdExtZbb())
AbsActions.customFor({s32, sXLen}).minScalar(0, sXLen);
AbsActions.customFor({sXLen}).minScalar(0, sXLen);
AbsActions.lower();

auto &MinMaxActions =
234 changes: 199 additions & 35 deletions llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -413,6 +413,25 @@
return RVVCSI;
}

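// Collect the callee-saved registers handled by push or by the save/restore
// libcalls (i.e. those listed in FixedCSRFIMap); the prologue emits CFI
// offsets for exactly this subset.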
static SmallVector<CalleeSavedInfo, 8>
getPushOrLibCallsSavedInfo(const MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI) {
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();

SmallVector<CalleeSavedInfo, 8> PushPopOrLibCallsCSI;
if (!RVFI->useSaveRestoreLibCalls(MF) && !RVFI->isPushable(MF))
return PushPopOrLibCallsCSI;

for (auto &CS : CSI) {
const auto *FII = llvm::find_if(
FixedCSRFIMap, [&](auto P) { return P.first == CS.getReg(); });
if (FII != std::end(FixedCSRFIMap))
PushPopOrLibCallsCSI.push_back(CS);
}

return PushPopOrLibCallsCSI;
}

void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -557,6 +576,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Determine the correct frame layout
determineFrameLayout(MF);

const auto &CSI = MFI.getCalleeSavedInfo();

// If libcalls are used to spill and restore callee-saved registers, the frame
// has two sections: the opaque section managed by the libcalls, and the
// section managed by MachineFrameInfo which can also hold callee saved
@@ -582,6 +603,23 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
unsigned LibCallFrameSize =
alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign());
RVFI->setLibCallStackSize(LibCallFrameSize);

unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, LibCallFrameSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);

for (const auto &Entry : getPushOrLibCallsSavedInfo(MF, CSI)) {
int FrameIdx = Entry.getFrameIdx();
int64_t Offset = MFI.getObjectOffset(FrameIdx);
Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
}

// FIXME (note copied from Lanai): This appears to be overallocating. Needs
@@ -616,23 +654,38 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t Spimm = std::min(alignDown(StackSize, 16), (uint64_t)48);
FirstFrameSetup->getOperand(1).setImm(Spimm);
StackSize -= Spimm;

unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize - StackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);

for (const auto &Entry : getPushOrLibCallsSavedInfo(MF, CSI)) {
int FrameIdx = Entry.getFrameIdx();
int64_t Offset = MFI.getObjectOffset(FrameIdx);
Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
}

if (StackSize != 0) {
// Allocate space on the stack if necessary.
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-StackSize), MachineInstr::FrameSetup,
getStackAlign());
}

// Emit ".cfi_def_cfa_offset RealStackSize"
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);

const auto &CSI = MFI.getCalleeSavedInfo();
// Emit ".cfi_def_cfa_offset RealStackSize"
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}

// The frame pointer is callee-saved, and code has been generated for us to
// save it to the stack. We need to skip over the storing of callee-saved
Expand All @@ -644,7 +697,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,

// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
for (const auto &Entry : CSI) {
for (const auto &Entry : getUnmanagedCSI(MF, CSI)) {
int FrameIdx = Entry.getFrameIdx();
if (FrameIdx >= 0 &&
MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
Expand Down Expand Up @@ -759,19 +812,29 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
void RISCVFrameLowering::deallocateStack(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, uint64_t StackSize,
const DebugLoc &DL,
uint64_t &StackSize,
int64_t CFAOffset) const {
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();

RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(StackSize),
MachineInstr::FrameDestroy, getStackAlign());
StackSize = 0;

unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}

void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
const RISCVInstrInfo *TII = STI.getInstrInfo();

// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
Expand Down Expand Up @@ -814,15 +877,25 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
RVFI->getVarArgsSaveSize();
uint64_t RVVStackSize = RVFI->getRVVStackSize();

bool RestoreFP = RI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() ||
!hasReservedCallFrame(MF);

bool RestoreSPFromFP = RI->hasStackRealignment(MF) ||
MFI.hasVarSizedObjects() || !hasReservedCallFrame(MF);
if (RVVStackSize) {
// If RestoreFP the stack pointer will be restored using the frame pointer
// value.
if (!RestoreFP)
// If RestoreSPFromFP the stack pointer will be restored using the frame
// pointer value.
if (!RestoreSPFromFP)
adjustStackForRVV(MF, MBB, LastFrameDestroy, DL, RVVStackSize,
MachineInstr::FrameDestroy);

if (!hasFP(MF)) {
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, RI->getDwarfRegNum(SPReg, true), RealStackSize));
BuildMI(MBB, LastFrameDestroy, DL,
TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}

emitCalleeSavedRVVEpilogCFI(MBB, LastFrameDestroy);
}

if (FirstSPAdjustAmount) {
Expand All @@ -831,12 +904,21 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");

// If RestoreFP the stack pointer will be restored using the frame pointer
// value.
if (!RestoreFP)
// If RestoreSPFromFP the stack pointer will be restored using the frame
// pointer value.
if (!RestoreSPFromFP)
RI->adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg,
StackOffset::getFixed(SecondSPAdjustAmount),
MachineInstr::FrameDestroy, getStackAlign());

if (!hasFP(MF)) {
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::cfiDefCfaOffset(nullptr, FirstSPAdjustAmount));
BuildMI(MBB, LastFrameDestroy, DL,
TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}
}

// Restore the stack pointer using the value of the frame pointer. Only
Expand All @@ -849,14 +931,44 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// normally it's just checking the variable sized object is present or not
// is enough, but we also don't preserve that at prologue/epilogue when
// have vector objects in stack.
if (RestoreFP) {
if (RestoreSPFromFP) {
assert(hasFP(MF) && "frame pointer should not have been eliminated");

RI->adjustReg(MBB, LastFrameDestroy, DL, SPReg, FPReg,
StackOffset::getFixed(-FPOffset), MachineInstr::FrameDestroy,
getStackAlign());
}

if (hasFP(MF)) {
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, RI->getDwarfRegNum(SPReg, true), RealStackSize));
BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}

if (getLibCallID(MF, CSI) != -1) {
// The tail __riscv_restore_[0-12] instruction is considered a terminator,
// therefore it is unnecessary to place any CFI instructions after it. Just
// deallocate the stack if needed and return.
if (StackSize != 0)
deallocateStack(MF, MBB, MBBI, DL, StackSize,
RVFI->getLibCallStackSize());

// Emit epilogue for shadow call stack.
emitSCSEpilogue(MF, MBB, MBBI, DL);
return;
}

// Recover callee-saved registers.
for (const auto &Entry : getUnmanagedCSI(MF, CSI)) {
Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, RI->getDwarfRegNum(Reg, true)));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}

bool ApplyPop = RVFI->isPushable(MF) && MBBI != MBB.end() &&
MBBI->getOpcode() == RISCV::CM_POP;
if (ApplyPop) {
Expand All @@ -872,12 +984,31 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
deallocateStack(MF, MBB, MBBI, DL, StackSize,
/*stack_adj of cm.pop instr*/ RealStackSize - StackSize);

++MBBI;
auto NextI = next_nodbg(MBBI, MBB.end());
if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) {
++MBBI;

for (const auto &Entry : getPushOrLibCallsSavedInfo(MF, CSI)) {
Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, RI->getDwarfRegNum(Reg, true)));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}

// Update the CFA offset. After CM_POP, SP should be equal to the CFA, so
// the CFA offset should be zero.
unsigned CFIIndex =
MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}
}

// Deallocate stack if we didn't already do it during cm.pop handling and
// StackSize isn't a zero
if (StackSize != 0 && !ApplyPop)
// Deallocate the stack if StackSize isn't zero yet
if (StackSize != 0)
deallocateStack(MF, MBB, MBBI, DL, StackSize, 0);

// Emit epilogue for shadow call stack.
Expand Down Expand Up @@ -1564,6 +1695,23 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
return true;
}

static unsigned getCalleeSavedRVVNumRegs(const Register &BaseReg) {
return RISCV::VRRegClass.contains(BaseReg) ? 1
: RISCV::VRM2RegClass.contains(BaseReg) ? 2
: RISCV::VRM4RegClass.contains(BaseReg) ? 4
: 8;
}

static MCRegister getRVVBaseRegister(const RISCVRegisterInfo &TRI,
const Register &Reg) {
MCRegister BaseReg = TRI.getSubReg(Reg, RISCV::sub_vrm1_0);
// If it's not a grouped vector register, it doesn't have a subregister, so
// the base register is just itself.
if (BaseReg == RISCV::NoRegister)
BaseReg = Reg;
return BaseReg;
}
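
These helpers exist because a register group carries one `CalleeSavedInfo` entry but CFI records are per architectural register; a standalone sketch of that expansion, with an illustrative printer standing in for LLVM's CFI machinery (real CFI uses DWARF register numbers, not names):

```cpp
#include <cstdio>

// An LMUL-4 group such as v8m4 covers v8..v11, and each member needs its
// own restore/offset directive.
enum class RVVClass { VR = 1, VRM2 = 2, VRM4 = 4, VRM8 = 8 };

static void emitPerRegRestore(unsigned baseReg, RVVClass cls) {
  unsigned numRegs = static_cast<unsigned>(cls); // 1, 2, 4, or 8 members
  for (unsigned i = 0; i < numRegs; ++i)
    std::printf(".cfi_restore v%u\n", baseReg + i);
}

int main() {
  emitPerRegRestore(8, RVVClass::VRM4); // prints v8, v9, v10, v11
  return 0;
}
```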

void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const {
MachineFunction *MF = MBB.getParent();
Expand All @@ -1590,15 +1738,8 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
// Insert the spill to the stack frame.
int FI = CS.getFrameIdx();
if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) {
MCRegister BaseReg = TRI.getSubReg(CS.getReg(), RISCV::sub_vrm1_0);
// If it's not a grouped vector register, it doesn't have subregister, so
// the base register is just itself.
if (BaseReg == RISCV::NoRegister)
BaseReg = CS.getReg();
unsigned NumRegs = RISCV::VRRegClass.contains(CS.getReg()) ? 1
: RISCV::VRM2RegClass.contains(CS.getReg()) ? 2
: RISCV::VRM4RegClass.contains(CS.getReg()) ? 4
: 8;
MCRegister BaseReg = getRVVBaseRegister(TRI, CS.getReg());
unsigned NumRegs = getCalleeSavedRVVNumRegs(CS.getReg());
for (unsigned i = 0; i < NumRegs; ++i) {
unsigned CFIIndex = MF->addFrameInst(createDefCFAOffset(
TRI, BaseReg + i, -FixedSize, MFI.getObjectOffset(FI) / 8 + i));
Expand All @@ -1610,6 +1751,29 @@ void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI(
}
}

void RISCVFrameLowering::emitCalleeSavedRVVEpilogCFI(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const {
MachineFunction *MF = MBB.getParent();
const MachineFrameInfo &MFI = MF->getFrameInfo();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const TargetInstrInfo &TII = *STI.getInstrInfo();
const RISCVRegisterInfo &TRI = *STI.getRegisterInfo();
DebugLoc DL = MBB.findDebugLoc(MI);

const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo());
for (auto &CS : RVVCSI) {
MCRegister BaseReg = getRVVBaseRegister(TRI, CS.getReg());
unsigned NumRegs = getCalleeSavedRVVNumRegs(CS.getReg());
for (unsigned i = 0; i < NumRegs; ++i) {
unsigned CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore(
nullptr, RI->getDwarfRegNum(BaseReg + i, true)));
BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameDestroy);
}
}
}

bool RISCVFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/RISCV/RISCVFrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
void emitCalleeSavedRVVPrologCFI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
bool HasFP) const;
void emitCalleeSavedRVVEpilogCFI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const;
void deallocateStack(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
uint64_t StackSize, int64_t CFAOffset) const;
uint64_t &StackSize, int64_t CFAOffset) const;

std::pair<int64_t, Align>
assignRVVStackObjectOffsets(MachineFunction &MF) const;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,

if (TT.isOSFuchsia() && !TT.isArch64Bit())
report_fatal_error("Fuchsia is only supported for 64-bit");

setCFIFixup(true);
}

const RISCVSubtarget *
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_KHR_expect_assume},
{"SPV_KHR_bit_instructions",
SPIRV::Extension::Extension::SPV_KHR_bit_instructions},
{"SPV_KHR_integer_dot_product",
SPIRV::Extension::Extension::SPV_KHR_integer_dot_product},
{"SPV_KHR_linkonce_odr",
SPIRV::Extension::Extension::SPV_KHR_linkonce_odr},
{"SPV_INTEL_inline_assembly",
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,9 @@ defm OpISubBorrow: BinOpTypedGen<"OpISubBorrow", 150, subc, 0, 1>;
def OpUMulExtended: BinOp<"OpUMulExtended", 151>;
def OpSMulExtended: BinOp<"OpSMulExtended", 152>;

def OpSDot: BinOp<"OpSDot", 4450>;
def OpUDot: BinOp<"OpUDot", 4451>;

// 3.42.14 Bit Instructions

defm OpShiftRightLogical: BinOpTypedGen<"OpShiftRightLogical", 194, srl, 0, 1>;
Expand Down
132 changes: 125 additions & 7 deletions llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,13 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;

template <bool Signed>
bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
template <bool Signed>
bool selectDot4AddPackedExpansion(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;

void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
Expand Down Expand Up @@ -1646,7 +1653,7 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
// Multiply the vectors, then sum the results
Register Vec0 = I.getOperand(2).getReg();
Register Vec1 = I.getOperand(3).getReg();
Register TmpVec = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Register TmpVec = MRI->createVirtualRegister(GR.getRegClass(ResType));
SPIRVType *VecType = GR.getSPIRVTypeForVReg(Vec0);

bool Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulV))
Expand All @@ -1660,18 +1667,18 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
GR.getScalarOrVectorComponentCount(VecType) > 1 &&
"dot product requires a vector of at least 2 components");

Register Res = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
Register Res = MRI->createVirtualRegister(GR.getRegClass(ResType));
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
.addDef(Res)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(TmpVec)
.addImm(0)
.constrainAllUses(TII, TRI, RBI);

for (unsigned i = 1; i < GR.getScalarOrVectorComponentCount(VecType); i++) {
Register Elt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Register Elt = MRI->createVirtualRegister(GR.getRegClass(ResType));

Result |=
Result &=
BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
.addDef(Elt)
.addUse(GR.getSPIRVTypeID(ResType))
Expand All @@ -1680,10 +1687,10 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
.constrainAllUses(TII, TRI, RBI);

Register Sum = i < GR.getScalarOrVectorComponentCount(VecType) - 1
? MRI->createVirtualRegister(&SPIRV::IDRegClass)
? MRI->createVirtualRegister(GR.getRegClass(ResType))
: ResVReg;

Result |= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
.addDef(Sum)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(Res)
Expand All @@ -1695,6 +1702,112 @@ bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg,
return Result;
}

template <bool Signed>
bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
assert(I.getNumOperands() == 5);
assert(I.getOperand(2).isReg());
assert(I.getOperand(3).isReg());
assert(I.getOperand(4).isReg());
MachineBasicBlock &BB = *I.getParent();

auto DotOp = Signed ? SPIRV::OpSDot : SPIRV::OpUDot;
Register Dot = MRI->createVirtualRegister(GR.getRegClass(ResType));
bool Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(DotOp))
.addDef(Dot)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(I.getOperand(2).getReg())
.addUse(I.getOperand(3).getReg())
.constrainAllUses(TII, TRI, RBI);

Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(Dot)
.addUse(I.getOperand(4).getReg())
.constrainAllUses(TII, TRI, RBI);

return Result;
}

// Pre-1.6 SPIR-V has no DotProductInput4x8BitPacked capability (absent the
// SPV_KHR_integer_dot_product extension), so extract the elements of the
// packed inputs, multiply them, and add the results to the accumulator.
template <bool Signed>
bool SPIRVInstructionSelector::selectDot4AddPackedExpansion(
Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const {
assert(I.getNumOperands() == 5);
assert(I.getOperand(2).isReg());
assert(I.getOperand(3).isReg());
assert(I.getOperand(4).isReg());
MachineBasicBlock &BB = *I.getParent();

bool Result = true;

// Acc = C
Register Acc = I.getOperand(4).getReg();
SPIRVType *EltType = GR.getOrCreateSPIRVIntegerType(8, I, TII);
auto ExtractOp =
Signed ? SPIRV::OpBitFieldSExtract : SPIRV::OpBitFieldUExtract;

// Extract the i8 element, multiply and add it to the accumulator
for (unsigned i = 0; i < 4; i++) {
// A[i]
Register AElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
.addDef(AElt)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(I.getOperand(2).getReg())
.addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
.addUse(GR.getOrCreateConstInt(8, I, EltType, TII))
.constrainAllUses(TII, TRI, RBI);

// B[i]
Register BElt = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
.addDef(BElt)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(I.getOperand(3).getReg())
.addUse(GR.getOrCreateConstInt(i * 8, I, EltType, TII))
.addUse(GR.getOrCreateConstInt(8, I, EltType, TII))
.constrainAllUses(TII, TRI, RBI);

// A[i] * B[i]
Register Mul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulS))
.addDef(Mul)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(AElt)
.addUse(BElt)
.constrainAllUses(TII, TRI, RBI);

// Discard the 24 highest bits so that the stored i32 register is i8-equivalent
Register MaskMul = MRI->createVirtualRegister(&SPIRV::IDRegClass);
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(ExtractOp))
.addDef(MaskMul)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(Mul)
.addUse(GR.getOrCreateConstInt(0, I, EltType, TII))
.addUse(GR.getOrCreateConstInt(8, I, EltType, TII))
.constrainAllUses(TII, TRI, RBI);

// Acc = Acc + A[i] * B[i]
Register Sum =
i < 3 ? MRI->createVirtualRegister(&SPIRV::IDRegClass) : ResVReg;
Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS))
.addDef(Sum)
.addUse(GR.getSPIRVTypeID(ResType))
.addUse(Acc)
.addUse(MaskMul)
.constrainAllUses(TII, TRI, RBI);

Acc = Sum;
}

return Result;
}
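
The expansion leans on `OpBitFieldSExtract` to pull each signed byte out of the packed word; below is a standalone model of that extract under the usual shift-based definition (the helper name is ours, not SPIR-V's):

```cpp
#include <cassert>
#include <cstdint>

// OpBitFieldSExtract modelled with shifts: move the field to the top of the
// word, then arithmetic-shift back down so its sign bit propagates. Assumes
// offset + count <= 32 and count >= 1 (two's complement; the arithmetic
// right shift on negative values is guaranteed from C++20 onward).
static int32_t bitFieldSExtract(uint32_t word, unsigned offset, unsigned count) {
  uint32_t shifted = word << (32 - offset - count);
  return static_cast<int32_t>(shifted) >> (32 - count);
}

int main() {
  uint32_t packed = 0x80FF0102u; // bytes (low to high): 0x02, 0x01, 0xFF, 0x80
  assert(bitFieldSExtract(packed, 0, 8) == 2);
  assert(bitFieldSExtract(packed, 8, 8) == 1);
  assert(bitFieldSExtract(packed, 16, 8) == -1);   // 0xFF sign-extends
  assert(bitFieldSExtract(packed, 24, 8) == -128); // 0x80 sign-extends
  return 0;
}
```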

/// Transform saturate(x) to clamp(x, 0.0f, 1.0f) as SPIRV
/// does not have a saturate builtin.
bool SPIRVInstructionSelector::selectSaturate(Register ResVReg,
Expand Down Expand Up @@ -2528,6 +2641,11 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
case Intrinsic::spv_udot:
case Intrinsic::spv_sdot:
return selectIntegerDot(ResVReg, ResType, I);
case Intrinsic::spv_dot4add_i8packed:
if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_integer_dot_product) ||
STI.isAtLeastSPIRVVer(VersionTuple(1, 6)))
return selectDot4AddPacked<true>(ResVReg, ResType, I);
return selectDot4AddPackedExpansion<true>(ResVReg, ResType, I);
case Intrinsic::spv_all:
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
Expand Down
67 changes: 51 additions & 16 deletions llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,22 @@ void SPIRV::RequirementHandler::removeCapabilityIf(
namespace llvm {
namespace SPIRV {
void RequirementHandler::initAvailableCapabilities(const SPIRVSubtarget &ST) {
// Provided by all supported Vulkan versions and OpenCL.
addAvailableCaps({Capability::Shader, Capability::Linkage, Capability::Int8,
Capability::Int16});

if (ST.isAtLeastSPIRVVer(VersionTuple(1, 6)))
addAvailableCaps({Capability::DotProduct, Capability::DotProductInputAll,
Capability::DotProductInput4x8Bit,
Capability::DotProductInput4x8BitPacked});

// Add capabilities enabled by extensions.
for (auto Extension : ST.getAllAvailableExtensions()) {
CapabilityList EnabledCapabilities =
getCapabilitiesEnabledByExtension(Extension);
addAvailableCaps(EnabledCapabilities);
}

if (ST.isOpenCLEnv()) {
initAvailableCapabilitiesForOpenCL(ST);
return;
Expand All @@ -643,10 +659,8 @@ void RequirementHandler::initAvailableCapabilitiesForOpenCL(
const SPIRVSubtarget &ST) {
// Add the min requirements for different OpenCL and SPIR-V versions.
addAvailableCaps({Capability::Addresses, Capability::Float16Buffer,
Capability::Int16, Capability::Int8, Capability::Kernel,
Capability::Linkage, Capability::Vector16,
Capability::Groups, Capability::GenericPointer,
Capability::Shader});
Capability::Kernel, Capability::Vector16,
Capability::Groups, Capability::GenericPointer});
if (ST.hasOpenCLFullProfile())
addAvailableCaps({Capability::Int64, Capability::Int64Atomics});
if (ST.hasOpenCLImageSupport()) {
Expand Down Expand Up @@ -675,25 +689,16 @@ void RequirementHandler::initAvailableCapabilitiesForOpenCL(
// TODO: verify if this needs some checks.
addAvailableCaps({Capability::Float16, Capability::Float64});

// Add capabilities enabled by extensions.
for (auto Extension : ST.getAllAvailableExtensions()) {
CapabilityList EnabledCapabilities =
getCapabilitiesEnabledByExtension(Extension);
addAvailableCaps(EnabledCapabilities);
}

// TODO: add OpenCL extensions.
}

void RequirementHandler::initAvailableCapabilitiesForVulkan(
const SPIRVSubtarget &ST) {
addAvailableCaps({Capability::Shader, Capability::Linkage});

// Core in Vulkan 1.1 and earlier.
addAvailableCaps({Capability::Int16, Capability::Int64, Capability::Float16,
Capability::Float64, Capability::GroupNonUniform,
Capability::Image1D, Capability::SampledBuffer,
Capability::ImageBuffer,
addAvailableCaps({Capability::Int64, Capability::Float16, Capability::Float64,
Capability::GroupNonUniform, Capability::Image1D,
Capability::SampledBuffer, Capability::ImageBuffer,
Capability::UniformBufferArrayDynamicIndexing,
Capability::SampledImageArrayDynamicIndexing,
Capability::StorageBufferArrayDynamicIndexing,
Expand Down Expand Up @@ -1000,6 +1005,32 @@ void addOpAccessChainReqs(const MachineInstr &Instr,
}
}

static void AddDotProductRequirements(const MachineInstr &MI,
SPIRV::RequirementHandler &Reqs,
const SPIRVSubtarget &ST) {
if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_integer_dot_product))
Reqs.addExtension(SPIRV::Extension::SPV_KHR_integer_dot_product);
Reqs.addCapability(SPIRV::Capability::DotProduct);

const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const MachineInstr *InstrPtr = &MI;
assert(MI.getOperand(1).isReg() && "Unexpected operand in dot");

Register TypeReg = InstrPtr->getOperand(1).getReg();
SPIRVType *TypeDef = MRI.getVRegDef(TypeReg);
if (TypeDef->getOpcode() == SPIRV::OpTypeInt) {
assert(TypeDef->getOperand(1).getImm() == 32);
Reqs.addCapability(SPIRV::Capability::DotProductInput4x8BitPacked);
} else if (TypeDef->getOpcode() == SPIRV::OpTypeVector) {
SPIRVType *ScalarTypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg());
assert(ScalarTypeDef->getOpcode() == SPIRV::OpTypeInt);
auto Capability = ScalarTypeDef->getOperand(1).getImm() == 8
? SPIRV::Capability::DotProductInput4x8Bit
: SPIRV::Capability::DotProductInputAll;
Reqs.addCapability(Capability);
}
}
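
A compact sketch of the capability decision above, with toy types standing in for `SPIRVType` (the string names mirror the SPIR-V capabilities):

```cpp
#include <cassert>
#include <string>

// Toy operand type: either a scalar integer (the packed-i32 form) or a
// vector of integers. Mirrors the two branches of the requirement logic.
struct ToyType { bool isVector; unsigned scalarBits; };

static std::string dotInputCapability(ToyType t) {
  if (!t.isVector) {
    assert(t.scalarBits == 32); // the packed form is always a 32-bit scalar
    return "DotProductInput4x8BitPacked";
  }
  return t.scalarBits == 8 ? "DotProductInput4x8Bit" : "DotProductInputAll";
}

int main() {
  assert(dotInputCapability({false, 32}) == "DotProductInput4x8BitPacked");
  assert(dotInputCapability({true, 8}) == "DotProductInput4x8Bit");
  assert(dotInputCapability({true, 16}) == "DotProductInputAll");
  return 0;
}
```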

void addInstrRequirements(const MachineInstr &MI,
SPIRV::RequirementHandler &Reqs,
const SPIRVSubtarget &ST) {
Expand Down Expand Up @@ -1376,6 +1407,10 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::SplitBarrierINTEL);
}
break;
case SPIRV::OpSDot:
case SPIRV::OpUDot:
AddDotProductRequirements(MI, Reqs, ST);
break;
default:
break;
}
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,6 @@ bool SPIRVSubtarget::canDirectlyComparePointers() const {

void SPIRVSubtarget::initAvailableExtensions() {
AvailableExtensions.clear();
if (!isOpenCLEnv())
return;

AvailableExtensions.insert(Extensions.begin(), Extensions.end());
}

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,10 @@ defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []
defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
defm AsmINTEL : CapabilityOperand<5606, 0, 0, [SPV_INTEL_inline_assembly], []>;
defm DotProductInputAll : CapabilityOperand<6016, 0x10600, 0, [SPV_KHR_integer_dot_product], []>;
defm DotProductInput4x8Bit : CapabilityOperand<6017, 0x10600, 0, [SPV_KHR_integer_dot_product], [Int8]>;
defm DotProductInput4x8BitPacked : CapabilityOperand<6018, 0x10600, 0, [SPV_KHR_integer_dot_product], []>;
defm DotProduct : CapabilityOperand<6019, 0x10600, 0, [SPV_KHR_integer_dot_product], []>;
defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
Expand Down
96 changes: 96 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "MCTargetDesc/SystemZGNUInstPrinter.h"
#include "MCTargetDesc/SystemZHLASMInstPrinter.h"
#include "MCTargetDesc/SystemZMCExpr.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "SystemZConstantPoolValue.h"
#include "SystemZMCInstLower.h"
#include "TargetInfo/SystemZTargetInfo.h"
Expand Down Expand Up @@ -662,6 +663,23 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
LowerPATCHPOINT(*MI, Lower);
return;

case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
LowerPATCHABLE_FUNCTION_ENTER(*MI, Lower);
return;

case TargetOpcode::PATCHABLE_RET:
LowerPATCHABLE_RET(*MI, Lower);
return;

case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
llvm_unreachable("PATCHABLE_FUNCTION_EXIT should never be emitted");

case TargetOpcode::PATCHABLE_TAIL_CALL:
// TODO: Define a trampoline `__xray_FunctionTailExit` and differentiate a
// normal function exit from a tail exit.
llvm_unreachable("Tail call is handled in the normal case. See comments "
"around this assert.");

case SystemZ::EXRL_Pseudo: {
unsigned TargetInsOpc = MI->getOperand(0).getImm();
Register LenMinus1Reg = MI->getOperand(1).getReg();
Expand Down Expand Up @@ -844,6 +862,84 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
getSubtargetInfo());
}

void SystemZAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(
const MachineInstr &MI, SystemZMCInstLower &Lower) {
// .begin:
// j .end # -> stmg %r2, %r15, 16(%r15)
// nop
// llilf %2, FuncID
// brasl %r14, __xray_FunctionEntry@GOT
// .end:
//
// Update compiler-rt/lib/xray/xray_s390x.cpp accordingly when the number
// of instructions changes.
bool HasVectorFeature =
TM.getMCSubtargetInfo()->hasFeature(SystemZ::FeatureVector) &&
!TM.getMCSubtargetInfo()->hasFeature(SystemZ::FeatureSoftFloat);
MCSymbol *FuncEntry = OutContext.getOrCreateSymbol(
HasVectorFeature ? "__xray_FunctionEntryVec" : "__xray_FunctionEntry");
MCSymbol *BeginOfSled = OutContext.createTempSymbol("xray_sled_", true);
MCSymbol *EndOfSled = OutContext.createTempSymbol();
OutStreamer->emitLabel(BeginOfSled);
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::J)
.addExpr(MCSymbolRefExpr::create(EndOfSled, OutContext)));
EmitNop(OutContext, *OutStreamer, 2, getSubtargetInfo());
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::BRASL)
.addReg(SystemZ::R14D)
.addExpr(MCSymbolRefExpr::create(
FuncEntry, MCSymbolRefExpr::VK_PLT, OutContext)));
OutStreamer->emitLabel(EndOfSled);
recordSled(BeginOfSled, MI, SledKind::FUNCTION_ENTER, 2);
}
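
The patchable region must be exactly as long as `getInstSizeInBytes` later reports (18 bytes), or the runtime would patch over neighbouring code; a standalone arithmetic check of the entry-sled budget, assuming the standard z/Architecture encoding sizes (J: 4 bytes, 2-byte NOP, LLILF: 6, BRASL: 6):

```cpp
// Byte budget of the SystemZ XRay entry sled: j + nop + llilf + brasl.
int main() {
  constexpr int J = 4, Nop = 2, Llilf = 6, Brasl = 6;
  constexpr int EntrySled = J + Nop + Llilf + Brasl;
  static_assert(EntrySled == 18, "must match getInstSizeInBytes()");
  return 0;
}
```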

void SystemZAsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
SystemZMCInstLower &Lower) {
unsigned OpCode = MI.getOperand(0).getImm();
MCSymbol *FallthroughLabel = nullptr;
if (OpCode == SystemZ::CondReturn) {
FallthroughLabel = OutContext.createTempSymbol();
int64_t Cond0 = MI.getOperand(1).getImm();
int64_t Cond1 = MI.getOperand(2).getImm();
EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRC)
.addImm(Cond0)
.addImm(Cond1 ^ Cond0)
.addExpr(MCSymbolRefExpr::create(
FallthroughLabel, OutContext)));
}
// .begin:
// br %r14 # -> stmg %r2, %r15, 24(%r15)
// nop
// nop
// llilf %2,FuncID
// j __xray_FunctionExit@GOT
//
// Update compiler-rt/lib/xray/xray_s390x.cpp accordingly when the number
// of instructions changes.
bool HasVectorFeature =
TM.getMCSubtargetInfo()->hasFeature(SystemZ::FeatureVector) &&
!TM.getMCSubtargetInfo()->hasFeature(SystemZ::FeatureSoftFloat);
MCSymbol *FuncExit = OutContext.getOrCreateSymbol(
HasVectorFeature ? "__xray_FunctionExitVec" : "__xray_FunctionExit");
MCSymbol *BeginOfSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->emitLabel(BeginOfSled);
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D));
EmitNop(OutContext, *OutStreamer, 4, getSubtargetInfo());
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
EmitToStreamer(*OutStreamer,
MCInstBuilder(SystemZ::J)
.addExpr(MCSymbolRefExpr::create(
FuncExit, MCSymbolRefExpr::VK_PLT, OutContext)));
if (FallthroughLabel)
OutStreamer->emitLabel(FallthroughLabel);
recordSled(BeginOfSled, MI, SledKind::FUNCTION_EXIT, 2);
}
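
The fallthrough branch above relies on SystemZ's 4-bit condition-code masks: XORing the taken mask (`Cond1`) with the valid mask (`Cond0`) yields the complementary condition, so the BRC skips the sled exactly when the conditional return is not taken; that 4-byte BRC is also where the `+ 4` for `CondReturn` in `getInstSizeInBytes` comes from. A standalone sketch with illustrative masks:

```cpp
#include <cassert>

int main() {
  // After an integer compare, CCs 0..2 are possible: mask bit 8 is CC0,
  // 4 is CC1, 2 is CC2, 1 is CC3 (illustrative values).
  const unsigned CCValidCmp = 0b1110; // {CC0, CC1, CC2}
  const unsigned CCEqual = 0b1000;    // taken when CC0 (operands equal)
  const unsigned CCNotEqual = CCEqual ^ CCValidCmp;
  assert(CCNotEqual == 0b0110); // CC1 or CC2: less-than or greater-than
  return 0;
}
```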

// The *alignment* of 128-bit vector types is different between the software
// and hardware vector ABIs. If there is an externally visible use of a
// vector type in the module, it should be annotated with an attribute.
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &OS) override;

bool runOnMachineFunction(MachineFunction &MF) override {
AsmPrinter::runOnMachineFunction(MF);

// Emit the XRay table for this function.
emitXRayTable();

return false;
}

bool doInitialization(Module &M) override {
SM.reset();
return AsmPrinter::doInitialization(M);
Expand All @@ -124,6 +133,9 @@ class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
SystemZMCInstLower &Lower);
void LowerPATCHABLE_RET(const MachineInstr &MI, SystemZMCInstLower &Lower);
void emitAttributes(Module &M);
};
} // end namespace llvm
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/MC/MCInstrDesc.h"
Expand Down Expand Up @@ -1792,6 +1793,10 @@ unsigned SystemZInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return MI.getOperand(1).getImm();
else if (MI.getOpcode() == SystemZ::FENTRY_CALL)
return 6;
if (MI.getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER)
return 18;
if (MI.getOpcode() == TargetOpcode::PATCHABLE_RET)
return 18 + (MI.getOperand(0).getImm() == SystemZ::CondReturn ? 4 : 0);

return MI.getDesc().getSize();
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
bool GETTER() const { return ATTRIBUTE; }
#include "SystemZGenSubtargetInfo.inc"

bool isXRaySupported() const override { return true; }

bool isAddressedViaADA(const GlobalValue *GV) const;

// Return true if GV can be accessed using LARL for reloc model RM
Expand Down
Loading