-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Implement readcyclecounter for GFX12 #76965
Conversation
@llvm/pr-subscribers-backend-amdgpu — Author: Jay Foad (jayfoad). Changes — Full diff: https://github.com/llvm/llvm-project/pull/76965.diff — 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7bc..86d2d6cf3c5ebc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",
"Has SHADER_CYCLES hardware register"
>;
+def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers",
+ "HasShaderCyclesHiLoRegisters",
+ "true",
+ "Has SHADER_CYCLES_HI/LO hardware registers"
+>;
+
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
"HasMadMacF32Insts",
"true",
@@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureWavefrontSize32,
- FeatureShaderCyclesRegister,
+ FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
+def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">;
+
def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
AssemblerPredicate<(all_of FeatureFP8Insts)>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 91a70930326955..4fef389eeacbea 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -176,6 +176,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
+ bool HasShaderCyclesHiLoRegisters = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;
bool FlatAddressSpace = false;
@@ -819,6 +820,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasShaderCyclesRegister;
}
+ bool hasShaderCyclesHiLoRegisters() const {
+ return HasShaderCyclesHiLoRegisters;
+ }
+
bool hasVOP3Literal() const {
return HasVOP3Literal;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b43e..041355716b0825 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4827,6 +4827,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::GET_SHADERCYCLESHILO: {
+ assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ // The algorithm is:
+ //
+ // hi1 = getreg(SHADER_CYCLES_HI)
+ // lo1 = getreg(SHADER_CYCLES_LO)
+ // hi2 = getreg(SHADER_CYCLES_HI)
+ //
+ // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
+ // Otherwise there was overflow and the result is hi2:0. In both cases the
+ // result should represent the actual time at some point during the sequence
+ // of three getregs.
+ Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
+ .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
+ 0, 32));
+ Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
+ .addImm(
+ AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32));
+ Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
+ .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI,
+ 0, 32));
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(RegHi1)
+ .addReg(RegHi2);
+ Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
+ .addReg(RegLo1)
+ .addImm(0);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
+ .add(MI.getOperand(0))
+ .addReg(RegLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(RegHi2)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
case AMDGPU::SI_INDIRECT_SRC_V1:
case AMDGPU::SI_INDIRECT_SRC_V2:
case AMDGPU::SI_INDIRECT_SRC_V4:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9bc623abcd04b..55471107d41b53 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -316,6 +316,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI <
(outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;
+let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
+def GET_SHADERCYCLESHILO : SPseudoInstSI<
+ (outs SReg_64:$sdst), (ins),
+ [(set SReg_64:$sdst, (i64 (readcyclecounter)))]
+>;
+
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 7e0a486c8191e3..17b3fdc04ec934 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -8,12 +8,19 @@
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-SDAG -check-prefix=GCN %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
declare i64 @llvm.readcyclecounter() #0
; GCN-LABEL: {{^}}test_readcyclecounter:
; MEMTIME-DAG: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_load_{{dwordx2|b64}}
+; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
+; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
+; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
+; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
+; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
; GCN-DAG: lgkmcnt
; MEMTIME: store_dwordx2
; SIVI-NOT: lgkmcnt
@@ -43,8 +50,13 @@ define amdgpu_kernel void @test_readcyclecounter(ptr addrspace(1) %out) #0 {
;
; GCN-LABEL: {{^}}test_readcyclecounter_smem:
; MEMTIME-DAG: s_memtime
+; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
+; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
+; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GCN-DAG: s_load_{{dword|b32|b64}}
; GETREG-DAG: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
+; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
+; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
define amdgpu_cs i32 @test_readcyclecounter_smem(ptr addrspace(4) inreg %in) #0 {
%cycle0 = call i64 @llvm.readcyclecounter()
%in.v = load i64, ptr addrspace(4) %in
|
case AMDGPU::GET_SHADERCYCLESHILO: { | ||
assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); | ||
MachineRegisterInfo &MRI = MF->getRegInfo(); | ||
DebugLoc DL = MI.getDebugLoc(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use a const reference here, i.e. `const DebugLoc &DL = MI.getDebugLoc();`
// If hi1 == hi2 then there was no overflow and the result is hi2:lo1. | ||
// Otherwise there was overflow and the result is hi2:0. In both cases the | ||
// result should represent the actual time at some point during the sequence | ||
// of three getregs. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is pretty ugly
No description provided.