506 changes: 506 additions & 0 deletions clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp

Large diffs are not rendered by default.

52 changes: 52 additions & 0 deletions clang/tools/clang-sycl-linker/SYCLLinkOpts.td
@@ -0,0 +1,52 @@
include "llvm/Option/OptParser.td"

def LinkerOnlyOption : OptionFlag;

def help : Flag<["-", "--"], "help">,
HelpText<"Display available options (--help-hidden for more)">;

def help_hidden : Flag<["-", "--"], "help-hidden">,
HelpText<"Display all available options">;

def verbose : Flag<["-"], "v">, HelpText<"Print verbose information">;
def version : Flag<["--"], "version">,
HelpText<"Display the version number and exit">;

def o : JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">,
HelpText<"Path to file to write output">;
def output : Separate<["--"], "output-file">, Alias<o>, Flags<[HelpHidden]>,
HelpText<"Alias for -o">;

def library_path_EQ : Joined<["--", "-"], "library-path=">,
Flags<[HelpHidden]>, HelpText<"Add <dir> to the library search path">;

def device_libs_EQ : CommaJoined<["--", "-"], "device-libs=">,
Flags<[LinkerOnlyOption]>,
HelpText<"A comma separated list of device libraries that are linked during the device link.">;

def triple : Joined<["--"], "triple">,
HelpText<"The device target triple">;
def arch : Separate<["--", "-"], "arch">,
HelpText<"Specify the name of the target architecture.">;

def save_temps : Flag<["--", "-"], "save-temps">,
Flags<[LinkerOnlyOption]>, HelpText<"Save intermediate results">;

def dry_run : Flag<["--", "-"], "dry-run">, Flags<[LinkerOnlyOption]>,
HelpText<"Print generated commands without running.">;

def spirv_dump_device_code_EQ : Joined<["--", "-"], "spirv-dump-device-code=">,
Flags<[LinkerOnlyOption]>,
HelpText<"Path to the folder where the tool dumps SPIR-V device code. Other formats aren't dumped.">;

def is_windows_msvc_env : Flag<["--", "-"], "is-windows-msvc-env">,
Flags<[LinkerOnlyOption, HelpHidden]>;

def llvm_spirv_path_EQ : Joined<["--"], "llvm-spirv-path=">,
Flags<[LinkerOnlyOption]>, MetaVarName<"<dir>">,
HelpText<"Set the system llvm-spirv path">;

// Options to pass to llvm-spirv tool
def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">,
Flags<[LinkerOnlyOption]>,
HelpText<"Options that will control llvm-spirv step">;
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -207,6 +207,8 @@ def : GINodeEquiv<G_STORE, AMDGPUst_glue> {

def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> {
bit CheckMMOIsAtomic = 1;
let IfSignExtend = G_SEXTLOAD;
let IfZeroExtend = G_ZEXTLOAD;
}

def : GINodeEquiv<G_STORE, AMDGPUatomic_st_glue> {
21 changes: 17 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -504,23 +504,36 @@ def zextloadi16_#as : PatFrag<(ops node:$ptr), (zextloadi16 node:$ptr)> {

def atomic_load_8_#as : PatFrag<(ops node:$ptr), (atomic_load_8 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i8;
}

def atomic_load_16_#as : PatFrag<(ops node:$ptr), (atomic_load_16 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i16;
}

def atomic_load_32_#as : PatFrag<(ops node:$ptr), (atomic_load_32 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i32;
}

def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i64;
}

def atomic_load_zext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_8 node:$ptr)> {
let IsAtomic = 1;
}

def atomic_load_sext_8_#as : PatFrag<(ops node:$ptr), (atomic_load_sext_8 node:$ptr)> {
let IsAtomic = 1;
}

def atomic_load_zext_16_#as : PatFrag<(ops node:$ptr), (atomic_load_zext_16 node:$ptr)> {
let IsAtomic = 1;
}

def atomic_load_sext_16_#as : PatFrag<(ops node:$ptr), (atomic_load_sext_16 node:$ptr)> {
let IsAtomic = 1;
}

} // End let AddressSpaces
} // End foreach as

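For context, the new per-address-space atomic_load_zext_*/atomic_load_sext_* fragments above are intended to match IR of the following shape; this is a hedged sketch with illustrative function and value names (not taken from this patch), shown here for the global address space:

; Illustrative sketch: a zero-extended atomic i8 load from global memory that
; the new fragments let the backend select as a single unsigned byte load
; (e.g. GLOBAL_LOAD_UBYTE / BUFFER_LOAD_UBYTE) with no separate extend.
define i32 @sketch_atomic_zextload_global(ptr addrspace(1) %ptr) {
  %val = load atomic i8, ptr addrspace(1) %ptr monotonic, align 1
  %ext = zext i8 %val to i32
  ret i32 %ext
}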
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -983,15 +983,20 @@ defm BUFFER_LOAD_LDS_U16 : MUBUF_Pseudo_Loads_LDSOpc <
>;

defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, atomic_load_sext_8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, atomic_load_sext_16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, atomic_load_sext_16_global>;

foreach vt = Reg32Types.types in {
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -795,12 +795,19 @@ defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;

defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_zext_8_local">;
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_zext_8_local">;
defm : DSReadPat_mc <DS_READ_I8, i16, "atomic_load_sext_8_local">;
defm : DSReadPat_mc <DS_READ_I8, i32, "atomic_load_sext_8_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_zext_16_local">;
defm : DSReadPat_mc <DS_READ_I16, i32, "atomic_load_sext_16_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;

let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Atomic loads
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
def : DSReadPat_D16<DS_READ_U8_D16_HI, az_extloadi8_d16_hi_local, v2i16>;
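Similarly, a hedged sketch (illustrative names only) of the sign-extending atomic LDS load that the new DS_READ_I8/DS_READ_I16 patterns above are intended to cover:

; Illustrative sketch: a sign-extended atomic i16 load from local (LDS) memory
; that should now select to DS_READ_I16 instead of DS_READ_U16 followed by a
; separate sign-extend.
define i32 @sketch_atomic_sextload_local(ptr addrspace(3) %ptr) {
  %val = load atomic i16, ptr addrspace(3) %ptr monotonic, align 2
  %ext = sext i16 %val to i32
  ret i32 %ext
}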
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1355,18 +1355,25 @@ let OtherPredicates = [HasFlatAddressSpace] in {

def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;

def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
@@ -1456,6 +1463,7 @@ def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
}

let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>;
def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>;
def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>;
@@ -1477,8 +1485,14 @@ let OtherPredicates = [HasFlatGlobalInsts] in {

defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>;
@@ -1488,6 +1502,8 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, extloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, atomic_load_sext_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16>;

foreach vt = Reg32Types.types in {
@@ -1525,6 +1541,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global,
}

let OtherPredicates = [D16PreservesUnusedBits] in {
// TODO: Handle atomic loads
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2i16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_global, v2f16>;
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_global, v2i16>;
45 changes: 45 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -348,6 +348,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsNonExtLoad = 1;
}

def atomic_load_zext_glue :
PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
let IsZeroExtLoad = true;
}

def atomic_load_sext_glue :
PatFrag<(ops node:$ptr), (AMDGPUatomic_ld_glue node:$ptr)> {
let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic?
let IsSignExtLoad = true;
}

def atomic_load_8_glue : PatFrag<(ops node:$ptr),
(AMDGPUatomic_ld_glue node:$ptr)> {
let IsAtomic = 1;
@@ -372,6 +384,30 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr),
let MemoryVT = i64;
}

def atomic_load_zext_8_glue : PatFrag<(ops node:$ptr),
(atomic_load_zext_glue node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i8;
}

def atomic_load_sext_8_glue : PatFrag<(ops node:$ptr),
(atomic_load_sext_glue node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i8;
}

def atomic_load_zext_16_glue : PatFrag<(ops node:$ptr),
(atomic_load_zext_glue node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i16;
}

def atomic_load_sext_16_glue : PatFrag<(ops node:$ptr),
(atomic_load_sext_glue node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i16;
}

def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsLoad = 1;
let IsAnyExtLoad = 1;
@@ -453,6 +489,15 @@ def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_32_glue node:$ptr)>;
def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_64_glue node:$ptr)>;

def atomic_load_zext_8_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_zext_8_glue node:$ptr)>;
def atomic_load_sext_8_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_sext_8_glue node:$ptr)>;
def atomic_load_zext_16_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_zext_16_glue node:$ptr)>;
def atomic_load_sext_16_local_m0 : PatFrag<(ops node:$ptr),
(atomic_load_sext_16_glue node:$ptr)>;
} // End let AddressSpaces = LoadAddress_local.AddrSpaces


331 changes: 331 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_flat.ll
@@ -0,0 +1,331 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

define i8 @atomic_load_flat_monotonic_i8(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr %ptr monotonic, align 1
ret i8 %load
}

define i32 @atomic_load_flat_monotonic_i8_zext_to_i32(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i8_zext_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr %ptr monotonic, align 1
%ext = zext i8 %load to i32
ret i32 %ext
}

define i32 @atomic_load_flat_monotonic_i8_sext_to_i32(ptr %ptr) {
; GFX7-LABEL: atomic_load_flat_monotonic_i8_sext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX7-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i8_sext_to_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX8-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i8_sext_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX9-NEXT: flat_load_ubyte v3, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr %ptr monotonic, align 1
%ext = sext i8 %load to i32
ret i32 %ext
}

define i16 @atomic_load_flat_monotonic_i8_zext_to_i16(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i8_zext_to_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr %ptr monotonic, align 1
%ext = zext i8 %load to i16
ret i16 %ext
}

define i16 @atomic_load_flat_monotonic_i8_sext_to_i16(ptr %ptr) {
; GFX7-LABEL: atomic_load_flat_monotonic_i8_sext_to_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX7-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i8_sext_to_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX8-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i8_sext_to_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_sbyte v2, v[0:1] glc
; GFX9-NEXT: flat_load_ubyte v3, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i8, ptr %ptr monotonic, align 1
%ext = sext i8 %load to i16
ret i16 %ext
}

define i16 @atomic_load_flat_monotonic_i16(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
ret i16 %load
}

define i32 @atomic_load_flat_monotonic_i16_zext_to_i32(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i16_zext_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%ext = zext i16 %load to i32
ret i32 %ext
}

define i32 @atomic_load_flat_monotonic_i16_sext_to_i32(ptr %ptr) {
; GFX7-LABEL: atomic_load_flat_monotonic_i16_sext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_sshort v2, v[0:1] glc
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i16_sext_to_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_sshort v2, v[0:1] glc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i16_sext_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_sshort v2, v[0:1] glc
; GFX9-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%ext = sext i16 %load to i32
ret i32 %ext
}

define half @atomic_load_flat_monotonic_f16(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic half, ptr %ptr monotonic, align 2
ret half %load
}

define bfloat @atomic_load_flat_monotonic_bf16(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic bfloat, ptr %ptr monotonic, align 2
ret bfloat %load
}

define i32 @atomic_load_flat_monotonic_f16_zext_to_i32(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_f16_zext_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic half, ptr %ptr monotonic, align 2
%cast = bitcast half %load to i16
%ext = zext i16 %cast to i32
ret i32 %ext
}

define i32 @atomic_load_flat_monotonic_bf16_zext_to_i32(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_bf16_zext_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic bfloat, ptr %ptr monotonic, align 2
%cast = bitcast bfloat %load to i16
%ext = zext i16 %cast to i32
ret i32 %ext
}

define i32 @atomic_load_flat_monotonic_i16_d16_hi_shift(ptr %ptr) {
; GCN-LABEL: atomic_load_flat_monotonic_i16_d16_hi_shift:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%ext = zext i16 %load to i32
%shl = shl i32 %ext, 16
ret i32 %shl
}

define <2 x i16> @atomic_load_flat_monotonic_i16_d16_hi_vector_insert(ptr %ptr, <2 x i16> %vec) {
; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_hi_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 1
ret <2 x i16> %insert
}

define i32 @atomic_load_flat_monotonic_i16_d16_lo_or(ptr %ptr, i16 %high) {
; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_or:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%ext = zext i16 %load to i32
%high.ext = zext i16 %high to i32
%shl = shl i32 %high.ext, 16
%or = or i32 %shl, %ext
ret i32 %or
}

define <2 x i16> @atomic_load_flat_monotonic_i16_d16_lo_vector_insert(ptr %ptr, <2 x i16> %vec) {
; GFX7-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: atomic_load_flat_monotonic_i16_d16_lo_vector_insert:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff0000
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%load = load atomic i16, ptr %ptr monotonic, align 2
%insert = insertelement <2 x i16> %vec, i16 %load, i32 0
ret <2 x i16> %insert
}
662 changes: 662 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_global.ll

Large diffs are not rendered by default.

509 changes: 509 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_load_local_2.ll

Large diffs are not rendered by default.