Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2391,6 +2391,61 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (MI.isMetaInstruction())
Result = false;

else if (MI.isInlineAsm()) {
// Heuristically classify an inline asm instruction from the register classes
// of its register operands plus its mayLoad()/mayStore() flags, build the set
// of SchedGroupMask bits it could belong to, and accept it if that set
// intersects this group's SGMask.
auto &TRI = TII->getRegisterInfo();
auto &MRI = MI.getParent()->getParent()->getRegInfo();
bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
// Scan every register operand and record which kinds of register classes
// appear. The three checks are independent, so one operand may set more than
// one flag.
for (const MachineOperand &Operand : MI.operands())
if (Operand.isReg()) {
// NOTE(review): the returned register class pointer is dereferenced without
// a null check — confirm it cannot be null for any register operand that
// can reach this point.
auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
if (TRI.isVGPRClass(&RegClass))
VGPR_used = true;
// An AGPR operand, or any operand whose class is wider than 128 bits, is
// treated as evidence of an MFMA-style instruction.
if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
VMFMA_used = true;
if (TRI.isSGPRClass(&RegClass))
SGPR_used = true;
}

// Translate the observed flags into candidate SchedGroupMask bits.
unsigned long InlineAsmMask = 0;
// VALU: only VGPR operands seen, no memory access, and not flagged as MFMA.
if (VGPR_used && !SGPR_used && !VMFMA_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
// SALU: at least one SGPR operand and no memory access (VGPR operands do not
// exclude this bit).
if (SGPR_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
if (VMFMA_used)
InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
// Memory accesses are split on VGPR use: with a VGPR operand they are counted
// as VMEM, without one they are counted as DS.
if (VGPR_used && MayLoad)
InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_READ;
if (VGPR_used && MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_WRITE;
if (!VGPR_used && MayLoad)
InlineAsmMask |= (unsigned long)SchedGroupMask::DS_READ;
if (!VGPR_used && MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::DS_WRITE;
// Add the umbrella bits implied by the specific ones set above:
// VALU/SALU -> ALU, DS_READ/DS_WRITE -> DS, VMEM_READ/VMEM_WRITE -> VMEM.
// MFMA alone does not set ALU here.
if (InlineAsmMask & (unsigned long)SchedGroupMask::VALU ||
InlineAsmMask & (unsigned long)SchedGroupMask::SALU)
InlineAsmMask |= (unsigned long)SchedGroupMask::ALU;
if (InlineAsmMask & (unsigned long)SchedGroupMask::DS_READ ||
InlineAsmMask & (unsigned long)SchedGroupMask::DS_WRITE)
InlineAsmMask |= (unsigned long)SchedGroupMask::DS;
if (InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_READ ||
InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_WRITE)
InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM;

// The instruction is eligible when any inferred bit overlaps this group's
// mask.
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;

// Original implementation
// Disabled alternative kept for reference: it read an explicit "SGMASK:<n>"
// token out of the asm string (operand 0) and used <n> as the mask instead of
// inferring one from the operands.
#if 0
StringRef Text = MI.getOperand(0).getSymbolName();
if (Text.find("SGMASK:") != std::string::npos) {
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
}
#endif
}

else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
Expand Down
74 changes: 74 additions & 0 deletions llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
; CHECK: v_add_f32_e32
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_mfma_f64
; CHECK-NEXT: ;;#ASMEND
; CHECK: v_add_f32_e32
; ModuleID = '<stdin>'
source_filename = "llvm-link"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
@__hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0
Comment on lines +12 to +13
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed for the test to work?


; Function Attrs: convergent mustprogress norecurse nounwind
; Test kernel: for the element index i = workgroup.id.x * 64 + workitem.id.x,
; when i < %length it loads two <2 x float> values from %from.coerce, feeds
; them to an inline-asm v_mfma_f64_4x4x4f64, combines the asm result with two
; of the loaded lanes, and stores one float to %to.coerce[i]. The trailing
; sched.group.barrier calls request the instruction ordering that the CHECK
; lines at the top of the file look for.
define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
entry:
; i = (workgroup.id.x << 6) + workitem.id.x; skip the body when i >= %length.
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%mul = shl i32 %0, 6
%1 = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = add i32 %mul, %1
%cmp = icmp slt i32 %add, %length
br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
; Destination address: &to[i]. Source addresses: &from[4*i] and &from[4*i + 2].
%idx.ext = sext i32 %add to i64
%add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
%mul3 = shl nsw i32 %add, 2
%idx.ext4 = sext i32 %mul3 to i64
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
%2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
%a20 = add i64 %idx.ext4, 2
%a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
%a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
; %3 and %4 are not referenced anywhere below.
%3 = extractelement <2 x float> %a22, i64 1
%4 = extractelement <2 x float> %2, i64 0
; Inline asm under test: output constrained to "a", both inputs to "v".
%5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
; Result = asm result + (lane 1 of the first load + lane 0 of the second load).
%6 = extractelement <2 x float> %2, i64 1
%7 = extractelement <2 x float> %a22, i64 0
%add6 = fadd contract float %6, %7
%add7 = fadd contract float %5, %add6
store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
; Scheduling pipeline request. Operands are presumably (mask, size, sync id),
; with mask 16 = VMEM, 2 = VALU, 8 = MFMA — TODO confirm against the
; llvm.amdgcn.sched.group.barrier documentation. The mask-8 group is the one
; the inline-asm MFMA is expected to fill.
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
br label %if.end

if.end: ; preds = %if.then, %entry
ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1

; Function Attrs: convergent nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2

attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nofree nounwind willreturn }
attributes #3 = { convergent nounwind memory(none) }

!0 = !{!1, !1, i64 0}
!1 = !{!"omnipotent char", !2, i64 0}
!2 = !{!"Simple C++ TBAA"}
!3 = !{i64 129}
!4 = !{!5, !5, i64 0}
!5 = !{!"float", !1, i64 0}
Loading