-
Notifications
You must be signed in to change notification settings - Fork 15.1k
Added Conditions of SM90 and ISA7.8 for Using cvt.ftz.f32.bf16 Instruction #165774
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified. If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using `@` followed by their GitHub username. If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers. If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
|
@llvm/pr-subscribers-backend-nvptx Author: None (yasmincs) Changes: Updated the conditions for generating the cvt.ftz.f32.bf16 instruction to include sm90 and isa7.8, so that ftz is only generated when it is supported. Patch is 23.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165774.diff 2 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..af1601f8b60fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2268,7 +2268,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>;
// fpextend bf16 -> f32
-def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ,hasPTX<78>, hasSM<90>]>;
def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 4d930cd9e57c0..3626613cf8511 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
@@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsub(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
+; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsub(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_faddx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
+; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_faddx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fsubx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
+; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fsubx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fmulx2(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b32 %r<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
+; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
+; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fmulx2(
; SM90: {
; SM90-NEXT: .reg .b32 %r<4>;
@@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2;
; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fdiv(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<5>;
+; SM90-FTZ-NEXT: .reg .b32 %r<8>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
+; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2;
+; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4;
+; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fdiv(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<5>;
@@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fpext_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fpext_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptrunc_float(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptrunc_float(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000;
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2;
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fadd_imm_1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
+; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80;
+; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fadd_imm_1(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<4>;
@@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1;
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_extload_bf16x8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<9>;
+; SM90-FTZ-NEXT: .reg .b32 %r<13>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
+; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3;
+; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4;
+; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1;
+; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2;
+; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1;
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9};
+; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_extload_bf16x8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<9>;
@@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptosi_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptosi_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) {
; SM80-FTZ-EMPTY:
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
+; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1;
; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1;
; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2;
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_fptoui_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
+; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
+; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_fptoui_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_sitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_sitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i8(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i1(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .pred %p<2>;
+; SM90-FTZ-NEXT: .reg .b16 %rs<4>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
+; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
+; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1;
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i1(
; SM90: {
; SM90-NEXT: .reg .pred %p<2>;
@@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i16(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<3>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<3>;
@@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i32(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b32 %r<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i32(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
; SM80-FTZ-NEXT: ret;
;
+; SM90-FTZ-LABEL: test_uitofp_i64(
+; SM90-FTZ: {
+; SM90-FTZ-NEXT: .reg .b16 %rs<2>;
+; SM90-FTZ-NEXT: .reg .b64 %rd<2>;
+; SM90-FTZ-EMPTY:
+; SM90-FTZ-NEXT: // %bb.0:
+; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
+; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
+; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
+; SM90-FTZ-NEXT: ret;
+;
; SM90-LABEL: test_uitofp_i64(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) {
; SM80-FTZ-EMPTY:...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Updated the conditions for generating the cvt.ftz.f32.bf16 instruction to include sm90 and isa7.8, so that ftz is only generated when it is supported.