diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 86d6f7c3fc3a3..5b46b29ea3420 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1516,20 +1516,19 @@ def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtN // Byte extraction via shift/trunc/sext -def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), - (CVT_s8_s32 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), +def : Pat<(i16 (sext_inreg (trunc i32:$s), i8)), (CVT_s8_s32 $s, CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), (CVT_s8_s64 $s, CvtNONE)>; + +def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), (BFE_S64rii $s, imm:$o, 8)>; + +def : Pat<(i16 (sext_inreg (trunc (srl i32:$s, (i32 imm:$o))), i8)), (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i32:$s, (i32 imm:$o)), i8), - (BFE_S32rii $s, imm:$o, 8)>; +def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), + (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; + def : Pat<(i16 (sra (i16 (trunc i32:$s)), (i32 8))), (CVT_s8_s32 (BFE_S32rii $s, 8, 8), CvtNONE)>; -def : Pat<(sext_inreg (srl i64:$s, (i32 imm:$o)), i8), - (BFE_S64rii $s, imm:$o, 8)>; -def : Pat<(i16 (sext_inreg (trunc i64:$s), i8)), - (CVT_s8_s64 $s, CvtNONE)>; -def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)), - (CVT_s8_s64 (BFE_S64rii $s, imm:$o, 8), CvtNONE)>; //----------------------------------- // Comparison instructions (setp, set) @@ -1713,45 +1712,34 @@ def : Pat<(i64 frameindex:$fi), (LEA_ADDRi64 (to_tframeindex $fi), 0)>; //----------------------------------- // Comparison and Selection //----------------------------------- +// TODO: These patterns seem very specific and brittle. We should try to find +// a more general solution. 
def cond_signed : PatLeaf<(cond), [{ return isSignedIntSetCC(N->get()); }]>; -def cond_not_signed : PatLeaf<(cond), [{ - return !isSignedIntSetCC(N->get()); -}]>; +// A 16-bit signed comparison of sign-extended byte extracts can be converted +// to 32-bit comparison if we change the PRMT to sign-extend the extracted +// bytes. +def : Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), + (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), + cond_signed:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, (to_sign_extend_selector $sel_a), PrmtNONE), + (PRMT_B32rii i32:$b, 0, (to_sign_extend_selector $sel_b), PrmtNONE), + (cond2cc $cc))>; + +// A 16-bit comparison of truncated byte extracts can be converted to 32-bit +// comparison because we know that the truncate is just truncating off zeros +// and that the most-significant byte is also zeros so the meaning of signed and +// unsigned comparisons will not be changed. +def : Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), + cond:$cc), + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -// comparisons of i8 extracted with PRMT as i32 -// It's faster to do comparison directly on i32 extracted by PRMT, -// instead of the long conversion and sign extending. 
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)), - (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), - (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; - -def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), - (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), - cond_not_signed:$cc), - (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), - (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), - (cond2cc $cc))>; def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 06c2cc83ca43c..26336b83c4f96 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_smax( ; O0: { ; O0-NEXT: .reg .pred %p<5>; -; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b32 %r<26>; ; O0-EMPTY: ; O0-NEXT: // 
%bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1]; ; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O0-NEXT: setp.gt.s32 %p1, %r4, %r3; -; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O0-NEXT: setp.gt.s32 %p2, %r6, %r5; -; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O0-NEXT: setp.gt.s32 %p3, %r8, %r7; -; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O0-NEXT: setp.gt.s32 %p4, %r10, %r9; -; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O0-NEXT: 
st.param.b32 [func_retval0], %r25; ; O0-NEXT: ret; ; ; O3-LABEL: test_smax( ; O3: { ; O3-NEXT: .reg .pred %p<5>; -; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b32 %r<26>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0]; ; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O3-NEXT: setp.gt.s32 %p1, %r4, %r3; -; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O3-NEXT: setp.gt.s32 %p2, %r6, %r5; -; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O3-NEXT: setp.gt.s32 %p3, %r8, %r7; -; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O3-NEXT: setp.gt.s32 %p4, %r10, %r9; -; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; 
O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; ; O3-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_smin( ; O0: { ; O0-NEXT: .reg .pred %p<5>; -; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b32 %r<26>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1]; ; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O0-NEXT: setp.le.s32 %p1, %r4, %r3; -; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O0-NEXT: setp.le.s32 %p2, %r6, %r5; -; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O0-NEXT: setp.le.s32 %p3, %r8, %r7; -; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O0-NEXT: setp.le.s32 %p4, %r10, %r9; -; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; 
O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r25; ; O0-NEXT: ret; ; ; O3-LABEL: test_smin( ; O3: { ; O3-NEXT: .reg .pred %p<5>; -; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b32 %r<26>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0]; ; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O3-NEXT: setp.le.s32 %p1, %r4, %r3; -; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O3-NEXT: setp.le.s32 %p2, %r6, %r5; -; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O3-NEXT: setp.le.s32 %p3, %r8, %r7; -; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O3-NEXT: setp.le.s32 %p4, %r10, %r9; -; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O3-NEXT: prmt.b32 %r17, %r16, 
%r13, 0x5410U; -; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; ; O3-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b