diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 300addd7d4daf..dfcc97b5880f5 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -84,288 +84,290 @@ def i16 : VTInt<16, 6>; // 16-bit integer value def i32 : VTInt<32, 7>; // 32-bit integer value def i64 : VTInt<64, 8>; // 64-bit integer value def i128 : VTInt<128, 9>; // 128-bit integer value - -def bf16 : VTFP<16, 10>; // 16-bit brain floating point value -def f16 : VTFP<16, 11>; // 16-bit floating point value -def f32 : VTFP<32, 12>; // 32-bit floating point value -def f64 : VTFP<64, 13>; // 64-bit floating point value -def f80 : VTFP<80, 14>; // 80-bit floating point value -def f128 : VTFP<128, 15>; // 128-bit floating point value -def ppcf128 : VTFP<128, 16>; // PPC 128-bit floating point value - -def v1i1 : VTVec<1, i1, 17>; // 1 x i1 vector value -def v2i1 : VTVec<2, i1, 18>; // 2 x i1 vector value -def v3i1 : VTVec<3, i1, 19>; // 3 x i1 vector value -def v4i1 : VTVec<4, i1, 20>; // 4 x i1 vector value -def v5i1 : VTVec<5, i1, 21>; // 5 x i1 vector value -def v6i1 : VTVec<6, i1, 22>; // 6 x i1 vector value -def v7i1 : VTVec<7, i1, 23>; // 7 x i1 vector value -def v8i1 : VTVec<8, i1, 24>; // 8 x i1 vector value -def v16i1 : VTVec<16, i1, 25>; // 16 x i1 vector value -def v32i1 : VTVec<32, i1, 26>; // 32 x i1 vector value -def v64i1 : VTVec<64, i1, 27>; // 64 x i1 vector value -def v128i1 : VTVec<128, i1, 28>; // 128 x i1 vector value -def v256i1 : VTVec<256, i1, 29>; // 256 x i1 vector value -def v512i1 : VTVec<512, i1, 30>; // 512 x i1 vector value -def v1024i1 : VTVec<1024, i1, 31>; // 1024 x i1 vector value -def v2048i1 : VTVec<2048, i1, 32>; // 2048 x i1 vector value -def v4096i1 : VTVec<4096, i1, 33>; // 4096 x i1 vector value - -def v128i2 : VTVec<128, i2, 34>; // 128 x i2 vector value -def v256i2 : VTVec<256, i2, 35>; // 256 x i2 vector value - -def v64i4 : VTVec<64, i4, 36>; // 64 x i4 vector value -def v128i4 : VTVec<128, i4, 37>; // 128 x i4 vector value - -def v1i8 : VTVec<1, i8, 38>; // 1 x i8 vector value -def v2i8 : VTVec<2, i8, 39>; // 2 x i8 vector value -def v3i8 : VTVec<3, i8, 40>; // 3 x i8 vector value -def v4i8 : VTVec<4, i8, 41>; // 4 x i8 vector value -def v5i8 : VTVec<5, i8, 42>; // 5 x i8 vector value -def v6i8 : VTVec<6, i8, 43>; // 6 x i8 vector value -def v7i8 : VTVec<7, i8, 44>; // 7 x i8 vector value -def v8i8 : VTVec<8, i8, 45>; // 8 x i8 vector value -def v16i8 : VTVec<16, i8, 46>; // 16 x i8 vector value -def v32i8 : VTVec<32, i8, 47>; // 32 x i8 vector value -def v64i8 : VTVec<64, i8, 48>; // 64 x i8 vector value -def v128i8 : VTVec<128, i8, 49>; // 128 x i8 vector value -def v256i8 : VTVec<256, i8, 50>; // 256 x i8 vector value -def v512i8 : VTVec<512, i8, 51>; // 512 x i8 vector value -def v1024i8 : VTVec<1024, i8, 52>; // 1024 x i8 vector value - -def v1i16 : VTVec<1, i16, 53>; // 1 x i16 vector value -def v2i16 : VTVec<2, i16, 54>; // 2 x i16 vector value -def v3i16 : VTVec<3, i16, 55>; // 3 x i16 vector value -def v4i16 : VTVec<4, i16, 56>; // 4 x i16 vector value -def v5i16 : VTVec<5, i16, 57>; // 5 x i16 vector value -def v6i16 : VTVec<6, i16, 58>; // 6 x i16 vector value -def v7i16 : VTVec<7, i16, 59>; // 7 x i16 vector value -def v8i16 : VTVec<8, i16, 60>; // 8 x i16 vector value -def v16i16 : VTVec<16, i16, 61>; // 16 x i16 vector value -def v32i16 : VTVec<32, i16, 62>; // 32 x i16 vector value -def v64i16 : VTVec<64, i16, 63>; // 64 x i16 vector value -def v128i16 : VTVec<128, i16, 64>; // 
128 x i16 vector value -def v256i16 : VTVec<256, i16, 65>; // 256 x i16 vector value -def v512i16 : VTVec<512, i16, 66>; // 512 x i16 vector value -def v4096i16 : VTVec<4096, i16, 67>; // 4096 x i16 vector value - -def v1i32 : VTVec<1, i32, 68>; // 1 x i32 vector value -def v2i32 : VTVec<2, i32, 69>; // 2 x i32 vector value -def v3i32 : VTVec<3, i32, 70>; // 3 x i32 vector value -def v4i32 : VTVec<4, i32, 71>; // 4 x i32 vector value -def v5i32 : VTVec<5, i32, 72>; // 5 x i32 vector value -def v6i32 : VTVec<6, i32, 73>; // 6 x i32 vector value -def v7i32 : VTVec<7, i32, 74>; // 7 x i32 vector value -def v8i32 : VTVec<8, i32, 75>; // 8 x i32 vector value -def v9i32 : VTVec<9, i32, 76>; // 9 x i32 vector value -def v10i32 : VTVec<10, i32, 77>; // 10 x i32 vector value -def v11i32 : VTVec<11, i32, 78>; // 11 x i32 vector value -def v12i32 : VTVec<12, i32, 79>; // 12 x i32 vector value -def v16i32 : VTVec<16, i32, 80>; // 16 x i32 vector value -def v32i32 : VTVec<32, i32, 81>; // 32 x i32 vector value -def v64i32 : VTVec<64, i32, 82>; // 64 x i32 vector value -def v128i32 : VTVec<128, i32, 83>; // 128 x i32 vector value -def v256i32 : VTVec<256, i32, 84>; // 256 x i32 vector value -def v512i32 : VTVec<512, i32, 85>; // 512 x i32 vector value -def v1024i32 : VTVec<1024, i32, 86>; // 1024 x i32 vector value -def v2048i32 : VTVec<2048, i32, 87>; // 2048 x i32 vector value -def v4096i32 : VTVec<4096, i32, 88>; // 4096 x i32 vector value - -def v1i64 : VTVec<1, i64, 89>; // 1 x i64 vector value -def v2i64 : VTVec<2, i64, 90>; // 2 x i64 vector value -def v3i64 : VTVec<3, i64, 91>; // 3 x i64 vector value -def v4i64 : VTVec<4, i64, 92>; // 4 x i64 vector value -def v8i64 : VTVec<8, i64, 93>; // 8 x i64 vector value -def v16i64 : VTVec<16, i64, 94>; // 16 x i64 vector value -def v32i64 : VTVec<32, i64, 95>; // 32 x i64 vector value -def v64i64 : VTVec<64, i64, 96>; // 64 x i64 vector value -def v128i64 : VTVec<128, i64, 97>; // 128 x i64 vector value -def v256i64 : VTVec<256, i64, 98>; // 256 x i64 vector value - -def v1i128 : VTVec<1, i128, 99>; // 1 x i128 vector value - -def v1f16 : VTVec<1, f16, 100>; // 1 x f16 vector value -def v2f16 : VTVec<2, f16, 101>; // 2 x f16 vector value -def v3f16 : VTVec<3, f16, 102>; // 3 x f16 vector value -def v4f16 : VTVec<4, f16, 103>; // 4 x f16 vector value -def v5f16 : VTVec<5, f16, 104>; // 5 x f16 vector value -def v6f16 : VTVec<6, f16, 105>; // 6 x f16 vector value -def v7f16 : VTVec<7, f16, 106>; // 7 x f16 vector value -def v8f16 : VTVec<8, f16, 107>; // 8 x f16 vector value -def v16f16 : VTVec<16, f16, 108>; // 16 x f16 vector value -def v32f16 : VTVec<32, f16, 109>; // 32 x f16 vector value -def v64f16 : VTVec<64, f16, 110>; // 64 x f16 vector value -def v128f16 : VTVec<128, f16, 111>; // 128 x f16 vector value -def v256f16 : VTVec<256, f16, 112>; // 256 x f16 vector value -def v512f16 : VTVec<512, f16, 113>; // 512 x f16 vector value -def v4096f16 : VTVec<4096, f16, 114>; // 4096 x f16 vector value - -def v1bf16 : VTVec<1, bf16, 115>; // 1 x bf16 vector value -def v2bf16 : VTVec<2, bf16, 116>; // 2 x bf16 vector value -def v3bf16 : VTVec<3, bf16, 117>; // 3 x bf16 vector value -def v4bf16 : VTVec<4, bf16, 118>; // 4 x bf16 vector value -def v8bf16 : VTVec<8, bf16, 119>; // 8 x bf16 vector value -def v16bf16 : VTVec<16, bf16, 120>; // 16 x bf16 vector value -def v32bf16 : VTVec<32, bf16, 121>; // 32 x bf16 vector value -def v64bf16 : VTVec<64, bf16, 122>; // 64 x bf16 vector value -def v128bf16 : VTVec<128, bf16, 123>; // 128 x bf16 vector value -def 
v4096bf16 : VTVec<4096, bf16, 124>; // 4096 x bf16 vector value - -def v1f32 : VTVec<1, f32, 125>; // 1 x f32 vector value -def v2f32 : VTVec<2, f32, 126>; // 2 x f32 vector value -def v3f32 : VTVec<3, f32, 127>; // 3 x f32 vector value -def v4f32 : VTVec<4, f32, 128>; // 4 x f32 vector value -def v5f32 : VTVec<5, f32, 129>; // 5 x f32 vector value -def v6f32 : VTVec<6, f32, 130>; // 6 x f32 vector value -def v7f32 : VTVec<7, f32, 131>; // 7 x f32 vector value -def v8f32 : VTVec<8, f32, 132>; // 8 x f32 vector value -def v9f32 : VTVec<9, f32, 133>; // 9 x f32 vector value -def v10f32 : VTVec<10, f32, 134>; // 10 x f32 vector value -def v11f32 : VTVec<11, f32, 135>; // 11 x f32 vector value -def v12f32 : VTVec<12, f32, 136>; // 12 x f32 vector value -def v16f32 : VTVec<16, f32, 137>; // 16 x f32 vector value -def v32f32 : VTVec<32, f32, 138>; // 32 x f32 vector value -def v64f32 : VTVec<64, f32, 139>; // 64 x f32 vector value -def v128f32 : VTVec<128, f32, 140>; // 128 x f32 vector value -def v256f32 : VTVec<256, f32, 141>; // 256 x f32 vector value -def v512f32 : VTVec<512, f32, 142>; // 512 x f32 vector value -def v1024f32 : VTVec<1024, f32, 143>; // 1024 x f32 vector value -def v2048f32 : VTVec<2048, f32, 144>; // 2048 x f32 vector value - -def v1f64 : VTVec<1, f64, 145>; // 1 x f64 vector value -def v2f64 : VTVec<2, f64, 146>; // 2 x f64 vector value -def v3f64 : VTVec<3, f64, 147>; // 3 x f64 vector value -def v4f64 : VTVec<4, f64, 148>; // 4 x f64 vector value -def v8f64 : VTVec<8, f64, 149>; // 8 x f64 vector value -def v16f64 : VTVec<16, f64, 150>; // 16 x f64 vector value -def v32f64 : VTVec<32, f64, 151>; // 32 x f64 vector value -def v64f64 : VTVec<64, f64, 152>; // 64 x f64 vector value -def v128f64 : VTVec<128, f64, 153>; // 128 x f64 vector value -def v256f64 : VTVec<256, f64, 154>; // 256 x f64 vector value - -def nxv1i1 : VTScalableVec<1, i1, 155>; // n x 1 x i1 vector value -def nxv2i1 : VTScalableVec<2, i1, 156>; // n x 2 x i1 vector value -def nxv4i1 : VTScalableVec<4, i1, 157>; // n x 4 x i1 vector value -def nxv8i1 : VTScalableVec<8, i1, 158>; // n x 8 x i1 vector value -def nxv16i1 : VTScalableVec<16, i1, 159>; // n x 16 x i1 vector value -def nxv32i1 : VTScalableVec<32, i1, 160>; // n x 32 x i1 vector value -def nxv64i1 : VTScalableVec<64, i1, 161>; // n x 64 x i1 vector value - -def nxv1i8 : VTScalableVec<1, i8, 162>; // n x 1 x i8 vector value -def nxv2i8 : VTScalableVec<2, i8, 163>; // n x 2 x i8 vector value -def nxv4i8 : VTScalableVec<4, i8, 164>; // n x 4 x i8 vector value -def nxv8i8 : VTScalableVec<8, i8, 165>; // n x 8 x i8 vector value -def nxv16i8 : VTScalableVec<16, i8, 166>; // n x 16 x i8 vector value -def nxv32i8 : VTScalableVec<32, i8, 167>; // n x 32 x i8 vector value -def nxv64i8 : VTScalableVec<64, i8, 168>; // n x 64 x i8 vector value - -def nxv1i16 : VTScalableVec<1, i16, 169>; // n x 1 x i16 vector value -def nxv2i16 : VTScalableVec<2, i16, 170>; // n x 2 x i16 vector value -def nxv4i16 : VTScalableVec<4, i16, 171>; // n x 4 x i16 vector value -def nxv8i16 : VTScalableVec<8, i16, 172>; // n x 8 x i16 vector value -def nxv16i16 : VTScalableVec<16, i16, 173>; // n x 16 x i16 vector value -def nxv32i16 : VTScalableVec<32, i16, 174>; // n x 32 x i16 vector value - -def nxv1i32 : VTScalableVec<1, i32, 175>; // n x 1 x i32 vector value -def nxv2i32 : VTScalableVec<2, i32, 176>; // n x 2 x i32 vector value -def nxv4i32 : VTScalableVec<4, i32, 177>; // n x 4 x i32 vector value -def nxv8i32 : VTScalableVec<8, i32, 178>; // n x 8 x i32 vector value -def 
nxv16i32 : VTScalableVec<16, i32, 179>; // n x 16 x i32 vector value -def nxv32i32 : VTScalableVec<32, i32, 180>; // n x 32 x i32 vector value - -def nxv1i64 : VTScalableVec<1, i64, 181>; // n x 1 x i64 vector value -def nxv2i64 : VTScalableVec<2, i64, 182>; // n x 2 x i64 vector value -def nxv4i64 : VTScalableVec<4, i64, 183>; // n x 4 x i64 vector value -def nxv8i64 : VTScalableVec<8, i64, 184>; // n x 8 x i64 vector value -def nxv16i64 : VTScalableVec<16, i64, 185>; // n x 16 x i64 vector value -def nxv32i64 : VTScalableVec<32, i64, 186>; // n x 32 x i64 vector value - -def nxv1f16 : VTScalableVec<1, f16, 187>; // n x 1 x f16 vector value -def nxv2f16 : VTScalableVec<2, f16, 188>; // n x 2 x f16 vector value -def nxv4f16 : VTScalableVec<4, f16, 189>; // n x 4 x f16 vector value -def nxv8f16 : VTScalableVec<8, f16, 190>; // n x 8 x f16 vector value -def nxv16f16 : VTScalableVec<16, f16, 191>; // n x 16 x f16 vector value -def nxv32f16 : VTScalableVec<32, f16, 192>; // n x 32 x f16 vector value - -def nxv1bf16 : VTScalableVec<1, bf16, 193>; // n x 1 x bf16 vector value -def nxv2bf16 : VTScalableVec<2, bf16, 194>; // n x 2 x bf16 vector value -def nxv4bf16 : VTScalableVec<4, bf16, 195>; // n x 4 x bf16 vector value -def nxv8bf16 : VTScalableVec<8, bf16, 196>; // n x 8 x bf16 vector value -def nxv16bf16 : VTScalableVec<16, bf16, 197>; // n x 16 x bf16 vector value -def nxv32bf16 : VTScalableVec<32, bf16, 198>; // n x 32 x bf16 vector value - -def nxv1f32 : VTScalableVec<1, f32, 199>; // n x 1 x f32 vector value -def nxv2f32 : VTScalableVec<2, f32, 200>; // n x 2 x f32 vector value -def nxv4f32 : VTScalableVec<4, f32, 201>; // n x 4 x f32 vector value -def nxv8f32 : VTScalableVec<8, f32, 202>; // n x 8 x f32 vector value -def nxv16f32 : VTScalableVec<16, f32, 203>; // n x 16 x f32 vector value - -def nxv1f64 : VTScalableVec<1, f64, 204>; // n x 1 x f64 vector value -def nxv2f64 : VTScalableVec<2, f64, 205>; // n x 2 x f64 vector value -def nxv4f64 : VTScalableVec<4, f64, 206>; // n x 4 x f64 vector value -def nxv8f64 : VTScalableVec<8, f64, 207>; // n x 8 x f64 vector value +def i256 : VTInt<256, 10>; // 256-bit integer value +def i512 : VTInt<512, 11>; // 512-bit integer value + +def bf16 : VTFP<16, 12>; // 16-bit brain floating point value +def f16 : VTFP<16, 13>; // 16-bit floating point value +def f32 : VTFP<32, 14>; // 32-bit floating point value +def f64 : VTFP<64, 15>; // 64-bit floating point value +def f80 : VTFP<80, 16>; // 80-bit floating point value +def f128 : VTFP<128, 17>; // 128-bit floating point value +def ppcf128 : VTFP<128, 18>; // PPC 128-bit floating point value + +def v1i1 : VTVec<1, i1, 19>; // 1 x i1 vector value +def v2i1 : VTVec<2, i1, 20>; // 2 x i1 vector value +def v3i1 : VTVec<3, i1, 21>; // 3 x i1 vector value +def v4i1 : VTVec<4, i1, 22>; // 4 x i1 vector value +def v5i1 : VTVec<5, i1, 23>; // 5 x i1 vector value +def v6i1 : VTVec<6, i1, 24>; // 6 x i1 vector value +def v7i1 : VTVec<7, i1, 25>; // 7 x i1 vector value +def v8i1 : VTVec<8, i1, 26>; // 8 x i1 vector value +def v16i1 : VTVec<16, i1, 27>; // 16 x i1 vector value +def v32i1 : VTVec<32, i1, 28>; // 32 x i1 vector value +def v64i1 : VTVec<64, i1, 29>; // 64 x i1 vector value +def v128i1 : VTVec<128, i1, 30>; // 128 x i1 vector value +def v256i1 : VTVec<256, i1, 31>; // 256 x i1 vector value +def v512i1 : VTVec<512, i1, 32>; // 512 x i1 vector value +def v1024i1 : VTVec<1024, i1, 33>; // 1024 x i1 vector value +def v2048i1 : VTVec<2048, i1, 34>; // 2048 x i1 vector value +def v4096i1 : VTVec<4096, i1, 
35>; // 4096 x i1 vector value + +def v128i2 : VTVec<128, i2, 36>; // 128 x i2 vector value +def v256i2 : VTVec<256, i2, 37>; // 256 x i2 vector value + +def v64i4 : VTVec<64, i4, 38>; // 64 x i4 vector value +def v128i4 : VTVec<128, i4, 39>; // 128 x i4 vector value + +def v1i8 : VTVec<1, i8, 40>; // 1 x i8 vector value +def v2i8 : VTVec<2, i8, 41>; // 2 x i8 vector value +def v3i8 : VTVec<3, i8, 42>; // 3 x i8 vector value +def v4i8 : VTVec<4, i8, 43>; // 4 x i8 vector value +def v5i8 : VTVec<5, i8, 44>; // 5 x i8 vector value +def v6i8 : VTVec<6, i8, 45>; // 6 x i8 vector value +def v7i8 : VTVec<7, i8, 46>; // 7 x i8 vector value +def v8i8 : VTVec<8, i8, 47>; // 8 x i8 vector value +def v16i8 : VTVec<16, i8, 48>; // 16 x i8 vector value +def v32i8 : VTVec<32, i8, 49>; // 32 x i8 vector value +def v64i8 : VTVec<64, i8, 50>; // 64 x i8 vector value +def v128i8 : VTVec<128, i8, 51>; // 128 x i8 vector value +def v256i8 : VTVec<256, i8, 52>; // 256 x i8 vector value +def v512i8 : VTVec<512, i8, 53>; // 512 x i8 vector value +def v1024i8 : VTVec<1024, i8, 54>; // 1024 x i8 vector value + +def v1i16 : VTVec<1, i16, 55>; // 1 x i16 vector value +def v2i16 : VTVec<2, i16, 56>; // 2 x i16 vector value +def v3i16 : VTVec<3, i16, 57>; // 3 x i16 vector value +def v4i16 : VTVec<4, i16, 58>; // 4 x i16 vector value +def v5i16 : VTVec<5, i16, 59>; // 5 x i16 vector value +def v6i16 : VTVec<6, i16, 60>; // 6 x i16 vector value +def v7i16 : VTVec<7, i16, 61>; // 7 x i16 vector value +def v8i16 : VTVec<8, i16, 62>; // 8 x i16 vector value +def v16i16 : VTVec<16, i16, 63>; // 16 x i16 vector value +def v32i16 : VTVec<32, i16, 64>; // 32 x i16 vector value +def v64i16 : VTVec<64, i16, 65>; // 64 x i16 vector value +def v128i16 : VTVec<128, i16, 66>; // 128 x i16 vector value +def v256i16 : VTVec<256, i16, 67>; // 256 x i16 vector value +def v512i16 : VTVec<512, i16, 68>; // 512 x i16 vector value +def v4096i16 : VTVec<4096, i16, 69>; // 4096 x i16 vector value + +def v1i32 : VTVec<1, i32, 70>; // 1 x i32 vector value +def v2i32 : VTVec<2, i32, 71>; // 2 x i32 vector value +def v3i32 : VTVec<3, i32, 72>; // 3 x i32 vector value +def v4i32 : VTVec<4, i32, 73>; // 4 x i32 vector value +def v5i32 : VTVec<5, i32, 74>; // 5 x i32 vector value +def v6i32 : VTVec<6, i32, 75>; // 6 x i32 vector value +def v7i32 : VTVec<7, i32, 76>; // 7 x i32 vector value +def v8i32 : VTVec<8, i32, 77>; // 8 x i32 vector value +def v9i32 : VTVec<9, i32, 78>; // 9 x i32 vector value +def v10i32 : VTVec<10, i32, 79>; // 10 x i32 vector value +def v11i32 : VTVec<11, i32, 80>; // 11 x i32 vector value +def v12i32 : VTVec<12, i32, 81>; // 12 x i32 vector value +def v16i32 : VTVec<16, i32, 82>; // 16 x i32 vector value +def v32i32 : VTVec<32, i32, 83>; // 32 x i32 vector value +def v64i32 : VTVec<64, i32, 84>; // 64 x i32 vector value +def v128i32 : VTVec<128, i32, 85>; // 128 x i32 vector value +def v256i32 : VTVec<256, i32, 86>; // 256 x i32 vector value +def v512i32 : VTVec<512, i32, 87>; // 512 x i32 vector value +def v1024i32 : VTVec<1024, i32, 88>; // 1024 x i32 vector value +def v2048i32 : VTVec<2048, i32, 89>; // 2048 x i32 vector value +def v4096i32 : VTVec<4096, i32, 90>; // 4096 x i32 vector value + +def v1i64 : VTVec<1, i64, 91>; // 1 x i64 vector value +def v2i64 : VTVec<2, i64, 92>; // 2 x i64 vector value +def v3i64 : VTVec<3, i64, 93>; // 3 x i64 vector value +def v4i64 : VTVec<4, i64, 94>; // 4 x i64 vector value +def v8i64 : VTVec<8, i64, 95>; // 8 x i64 vector value +def v16i64 : VTVec<16, i64, 96>; // 16 x i64 vector 
value +def v32i64 : VTVec<32, i64, 97>; // 32 x i64 vector value +def v64i64 : VTVec<64, i64, 98>; // 64 x i64 vector value +def v128i64 : VTVec<128, i64, 99>; // 128 x i64 vector value +def v256i64 : VTVec<256, i64, 100>; // 256 x i64 vector value + +def v1i128 : VTVec<1, i128, 101>; // 1 x i128 vector value + +def v1f16 : VTVec<1, f16, 102>; // 1 x f16 vector value +def v2f16 : VTVec<2, f16, 103>; // 2 x f16 vector value +def v3f16 : VTVec<3, f16, 104>; // 3 x f16 vector value +def v4f16 : VTVec<4, f16, 105>; // 4 x f16 vector value +def v5f16 : VTVec<5, f16, 106>; // 5 x f16 vector value +def v6f16 : VTVec<6, f16, 107>; // 6 x f16 vector value +def v7f16 : VTVec<7, f16, 108>; // 7 x f16 vector value +def v8f16 : VTVec<8, f16, 109>; // 8 x f16 vector value +def v16f16 : VTVec<16, f16, 110>; // 16 x f16 vector value +def v32f16 : VTVec<32, f16, 111>; // 32 x f16 vector value +def v64f16 : VTVec<64, f16, 112>; // 64 x f16 vector value +def v128f16 : VTVec<128, f16, 113>; // 128 x f16 vector value +def v256f16 : VTVec<256, f16, 114>; // 256 x f16 vector value +def v512f16 : VTVec<512, f16, 115>; // 512 x f16 vector value +def v4096f16 : VTVec<4096, f16, 116>; // 4096 x f16 vector value + +def v1bf16 : VTVec<1, bf16, 117>; // 1 x bf16 vector value +def v2bf16 : VTVec<2, bf16, 118>; // 2 x bf16 vector value +def v3bf16 : VTVec<3, bf16, 119>; // 3 x bf16 vector value +def v4bf16 : VTVec<4, bf16, 120>; // 4 x bf16 vector value +def v8bf16 : VTVec<8, bf16, 121>; // 8 x bf16 vector value +def v16bf16 : VTVec<16, bf16, 122>; // 16 x bf16 vector value +def v32bf16 : VTVec<32, bf16, 123>; // 32 x bf16 vector value +def v64bf16 : VTVec<64, bf16, 124>; // 64 x bf16 vector value +def v128bf16 : VTVec<128, bf16, 125>; // 128 x bf16 vector value +def v4096bf16 : VTVec<4096, bf16, 126>; // 4096 x bf16 vector value + +def v1f32 : VTVec<1, f32, 127>; // 1 x f32 vector value +def v2f32 : VTVec<2, f32, 128>; // 2 x f32 vector value +def v3f32 : VTVec<3, f32, 129>; // 3 x f32 vector value +def v4f32 : VTVec<4, f32, 130>; // 4 x f32 vector value +def v5f32 : VTVec<5, f32, 131>; // 5 x f32 vector value +def v6f32 : VTVec<6, f32, 132>; // 6 x f32 vector value +def v7f32 : VTVec<7, f32, 133>; // 7 x f32 vector value +def v8f32 : VTVec<8, f32, 134>; // 8 x f32 vector value +def v9f32 : VTVec<9, f32, 135>; // 9 x f32 vector value +def v10f32 : VTVec<10, f32, 136>; // 10 x f32 vector value +def v11f32 : VTVec<11, f32, 137>; // 11 x f32 vector value +def v12f32 : VTVec<12, f32, 138>; // 12 x f32 vector value +def v16f32 : VTVec<16, f32, 139>; // 16 x f32 vector value +def v32f32 : VTVec<32, f32, 140>; // 32 x f32 vector value +def v64f32 : VTVec<64, f32, 141>; // 64 x f32 vector value +def v128f32 : VTVec<128, f32, 142>; // 128 x f32 vector value +def v256f32 : VTVec<256, f32, 143>; // 256 x f32 vector value +def v512f32 : VTVec<512, f32, 144>; // 512 x f32 vector value +def v1024f32 : VTVec<1024, f32, 145>; // 1024 x f32 vector value +def v2048f32 : VTVec<2048, f32, 146>; // 2048 x f32 vector value + +def v1f64 : VTVec<1, f64, 147>; // 1 x f64 vector value +def v2f64 : VTVec<2, f64, 148>; // 2 x f64 vector value +def v3f64 : VTVec<3, f64, 149>; // 3 x f64 vector value +def v4f64 : VTVec<4, f64, 150>; // 4 x f64 vector value +def v8f64 : VTVec<8, f64, 151>; // 8 x f64 vector value +def v16f64 : VTVec<16, f64, 152>; // 16 x f64 vector value +def v32f64 : VTVec<32, f64, 153>; // 32 x f64 vector value +def v64f64 : VTVec<64, f64, 154>; // 64 x f64 vector value +def v128f64 : VTVec<128, f64, 155>; // 128 x f64 vector 
value +def v256f64 : VTVec<256, f64, 156>; // 256 x f64 vector value + +def nxv1i1 : VTScalableVec<1, i1, 157>; // n x 1 x i1 vector value +def nxv2i1 : VTScalableVec<2, i1, 158>; // n x 2 x i1 vector value +def nxv4i1 : VTScalableVec<4, i1, 159>; // n x 4 x i1 vector value +def nxv8i1 : VTScalableVec<8, i1, 160>; // n x 8 x i1 vector value +def nxv16i1 : VTScalableVec<16, i1, 161>; // n x 16 x i1 vector value +def nxv32i1 : VTScalableVec<32, i1, 162>; // n x 32 x i1 vector value +def nxv64i1 : VTScalableVec<64, i1, 163>; // n x 64 x i1 vector value + +def nxv1i8 : VTScalableVec<1, i8, 164>; // n x 1 x i8 vector value +def nxv2i8 : VTScalableVec<2, i8, 165>; // n x 2 x i8 vector value +def nxv4i8 : VTScalableVec<4, i8, 166>; // n x 4 x i8 vector value +def nxv8i8 : VTScalableVec<8, i8, 167>; // n x 8 x i8 vector value +def nxv16i8 : VTScalableVec<16, i8, 168>; // n x 16 x i8 vector value +def nxv32i8 : VTScalableVec<32, i8, 169>; // n x 32 x i8 vector value +def nxv64i8 : VTScalableVec<64, i8, 170>; // n x 64 x i8 vector value + +def nxv1i16 : VTScalableVec<1, i16, 171>; // n x 1 x i16 vector value +def nxv2i16 : VTScalableVec<2, i16, 172>; // n x 2 x i16 vector value +def nxv4i16 : VTScalableVec<4, i16, 173>; // n x 4 x i16 vector value +def nxv8i16 : VTScalableVec<8, i16, 174>; // n x 8 x i16 vector value +def nxv16i16 : VTScalableVec<16, i16, 175>; // n x 16 x i16 vector value +def nxv32i16 : VTScalableVec<32, i16, 176>; // n x 32 x i16 vector value + +def nxv1i32 : VTScalableVec<1, i32, 177>; // n x 1 x i32 vector value +def nxv2i32 : VTScalableVec<2, i32, 178>; // n x 2 x i32 vector value +def nxv4i32 : VTScalableVec<4, i32, 179>; // n x 4 x i32 vector value +def nxv8i32 : VTScalableVec<8, i32, 180>; // n x 8 x i32 vector value +def nxv16i32 : VTScalableVec<16, i32, 181>; // n x 16 x i32 vector value +def nxv32i32 : VTScalableVec<32, i32, 182>; // n x 32 x i32 vector value + +def nxv1i64 : VTScalableVec<1, i64, 183>; // n x 1 x i64 vector value +def nxv2i64 : VTScalableVec<2, i64, 184>; // n x 2 x i64 vector value +def nxv4i64 : VTScalableVec<4, i64, 185>; // n x 4 x i64 vector value +def nxv8i64 : VTScalableVec<8, i64, 186>; // n x 8 x i64 vector value +def nxv16i64 : VTScalableVec<16, i64, 187>; // n x 16 x i64 vector value +def nxv32i64 : VTScalableVec<32, i64, 188>; // n x 32 x i64 vector value + +def nxv1f16 : VTScalableVec<1, f16, 189>; // n x 1 x f16 vector value +def nxv2f16 : VTScalableVec<2, f16, 190>; // n x 2 x f16 vector value +def nxv4f16 : VTScalableVec<4, f16, 191>; // n x 4 x f16 vector value +def nxv8f16 : VTScalableVec<8, f16, 192>; // n x 8 x f16 vector value +def nxv16f16 : VTScalableVec<16, f16, 193>; // n x 16 x f16 vector value +def nxv32f16 : VTScalableVec<32, f16, 194>; // n x 32 x f16 vector value + +def nxv1bf16 : VTScalableVec<1, bf16, 195>; // n x 1 x bf16 vector value +def nxv2bf16 : VTScalableVec<2, bf16, 196>; // n x 2 x bf16 vector value +def nxv4bf16 : VTScalableVec<4, bf16, 197>; // n x 4 x bf16 vector value +def nxv8bf16 : VTScalableVec<8, bf16, 198>; // n x 8 x bf16 vector value +def nxv16bf16 : VTScalableVec<16, bf16, 199>; // n x 16 x bf16 vector value +def nxv32bf16 : VTScalableVec<32, bf16, 200>; // n x 32 x bf16 vector value + +def nxv1f32 : VTScalableVec<1, f32, 201>; // n x 1 x f32 vector value +def nxv2f32 : VTScalableVec<2, f32, 202>; // n x 2 x f32 vector value +def nxv4f32 : VTScalableVec<4, f32, 203>; // n x 4 x f32 vector value +def nxv8f32 : VTScalableVec<8, f32, 204>; // n x 8 x f32 vector value +def nxv16f32 : VTScalableVec<16, 
f32, 205>; // n x 16 x f32 vector value + +def nxv1f64 : VTScalableVec<1, f64, 206>; // n x 1 x f64 vector value +def nxv2f64 : VTScalableVec<2, f64, 207>; // n x 2 x f64 vector value +def nxv4f64 : VTScalableVec<4, f64, 208>; // n x 4 x f64 vector value +def nxv8f64 : VTScalableVec<8, f64, 209>; // n x 8 x f64 vector value // Sz = NF * MinNumElts * 8(bits) -def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 208>; // RISCV vector tuple(min_num_elts=1, nf=2) -def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 209>; // RISCV vector tuple(min_num_elts=1, nf=3) -def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 210>; // RISCV vector tuple(min_num_elts=1, nf=4) -def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 211>; // RISCV vector tuple(min_num_elts=1, nf=5) -def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 212>; // RISCV vector tuple(min_num_elts=1, nf=6) -def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 213>; // RISCV vector tuple(min_num_elts=1, nf=7) -def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 214>; // RISCV vector tuple(min_num_elts=1, nf=8) -def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 215>; // RISCV vector tuple(min_num_elts=2, nf=2) -def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 216>; // RISCV vector tuple(min_num_elts=2, nf=3) -def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 217>; // RISCV vector tuple(min_num_elts=2, nf=4) -def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 218>; // RISCV vector tuple(min_num_elts=2, nf=5) -def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 219>; // RISCV vector tuple(min_num_elts=2, nf=6) -def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 220>; // RISCV vector tuple(min_num_elts=2, nf=7) -def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 221>; // RISCV vector tuple(min_num_elts=2, nf=8) -def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 222>; // RISCV vector tuple(min_num_elts=4, nf=2) -def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 223>; // RISCV vector tuple(min_num_elts=4, nf=3) -def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 224>; // RISCV vector tuple(min_num_elts=4, nf=4) -def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 225>; // RISCV vector tuple(min_num_elts=4, nf=5) -def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 226>; // RISCV vector tuple(min_num_elts=4, nf=6) -def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 227>; // RISCV vector tuple(min_num_elts=4, nf=7) -def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 228>; // RISCV vector tuple(min_num_elts=4, nf=8) -def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 229>; // RISCV vector tuple(min_num_elts=8, nf=2) -def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 230>; // RISCV vector tuple(min_num_elts=8, nf=3) -def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 231>; // RISCV vector tuple(min_num_elts=8, nf=4) -def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 232>; // RISCV vector tuple(min_num_elts=8, nf=5) -def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 233>; // RISCV vector tuple(min_num_elts=8, nf=6) -def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 234>; // RISCV vector tuple(min_num_elts=8, nf=7) -def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 235>; // RISCV vector tuple(min_num_elts=8, nf=8) -def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 236>; // RISCV vector tuple(min_num_elts=16, nf=2) -def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 237>; // RISCV vector tuple(min_num_elts=16, nf=3) -def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 238>; // RISCV vector tuple(min_num_elts=16, nf=4) -def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 239>; // RISCV vector tuple(min_num_elts=32, nf=2) - -def x86mmx : ValueType<64, 240>; // X86 MMX value -def Glue : ValueType<0, 241>; // Pre-RA sched glue -def isVoid : ValueType<0, 242>; // Produces no value -def untyped : ValueType<8, 243> { // Produces an untyped value 
+def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 210>; // RISCV vector tuple(min_num_elts=1, nf=2)
+def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 211>; // RISCV vector tuple(min_num_elts=1, nf=3)
+def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 212>; // RISCV vector tuple(min_num_elts=1, nf=4)
+def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 213>; // RISCV vector tuple(min_num_elts=1, nf=5)
+def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 214>; // RISCV vector tuple(min_num_elts=1, nf=6)
+def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 215>; // RISCV vector tuple(min_num_elts=1, nf=7)
+def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 216>; // RISCV vector tuple(min_num_elts=1, nf=8)
+def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 217>; // RISCV vector tuple(min_num_elts=2, nf=2)
+def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 218>; // RISCV vector tuple(min_num_elts=2, nf=3)
+def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 219>; // RISCV vector tuple(min_num_elts=2, nf=4)
+def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 220>; // RISCV vector tuple(min_num_elts=2, nf=5)
+def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 221>; // RISCV vector tuple(min_num_elts=2, nf=6)
+def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 222>; // RISCV vector tuple(min_num_elts=2, nf=7)
+def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 223>; // RISCV vector tuple(min_num_elts=2, nf=8)
+def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 224>; // RISCV vector tuple(min_num_elts=4, nf=2)
+def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 225>; // RISCV vector tuple(min_num_elts=4, nf=3)
+def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 226>; // RISCV vector tuple(min_num_elts=4, nf=4)
+def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 227>; // RISCV vector tuple(min_num_elts=4, nf=5)
+def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 228>; // RISCV vector tuple(min_num_elts=4, nf=6)
+def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 229>; // RISCV vector tuple(min_num_elts=4, nf=7)
+def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 230>; // RISCV vector tuple(min_num_elts=4, nf=8)
+def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 231>; // RISCV vector tuple(min_num_elts=8, nf=2)
+def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 232>; // RISCV vector tuple(min_num_elts=8, nf=3)
+def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 233>; // RISCV vector tuple(min_num_elts=8, nf=4)
+def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 234>; // RISCV vector tuple(min_num_elts=8, nf=5)
+def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 235>; // RISCV vector tuple(min_num_elts=8, nf=6)
+def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 236>; // RISCV vector tuple(min_num_elts=8, nf=7)
+def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 237>; // RISCV vector tuple(min_num_elts=8, nf=8)
+def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 238>; // RISCV vector tuple(min_num_elts=16, nf=2)
+def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 239>; // RISCV vector tuple(min_num_elts=16, nf=3)
+def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 240>; // RISCV vector tuple(min_num_elts=16, nf=4)
+def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 241>; // RISCV vector tuple(min_num_elts=32, nf=2)
+
+def x86mmx : ValueType<64, 242>;  // X86 MMX value
+def Glue : ValueType<0, 243>;     // Pre-RA sched glue
+def isVoid : ValueType<0, 244>;   // Produces no value
+def untyped : ValueType<8, 245> { // Produces an untyped value
   let LLVMName = "Untyped";
 }
 
-def funcref : ValueType<0, 244>;    // WebAssembly's funcref type
-def externref : ValueType<0, 245>;  // WebAssembly's externref type
-def exnref : ValueType<0, 246>;     // WebAssembly's exnref type
-def x86amx : ValueType<8192, 247>;  // X86 AMX value
-def i64x8 : ValueType<512, 248>;    // 8 Consecutive GPRs (AArch64)
+def funcref : ValueType<0, 246>;    // WebAssembly's funcref type
+def externref : ValueType<0, 247>;  // WebAssembly's externref type
+def exnref : ValueType<0, 248>;     // WebAssembly's exnref type
+def x86amx : ValueType<8192, 249>;  // X86 AMX value
+def i64x8 : ValueType<512, 250>;    // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
-    : ValueType<16, 249>;           // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 250>; // SPIR-V's builtin type
+    : ValueType<16, 251>;           // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 252>; // SPIR-V's builtin type
 
 // AMDGPU buffer fat pointer, buffer rsrc + offset, rewritten before MIR translation.
 // FIXME: Remove this and the getPointerType() override if MVT::i160 is added.
-def amdgpuBufferFatPointer : ValueType<160, 251>;
+def amdgpuBufferFatPointer : ValueType<160, 253>;
 
 // AMDGPU buffer strided pointer, buffer rsrc + index + offset, doesn't reach MIR.
 // FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
-def amdgpuBufferStridedPointer : ValueType<192, 252>;
+def amdgpuBufferStridedPointer : ValueType<192, 254>;
 
-def aarch64mfp8 : ValueType<8, 253>; // 8-bit value in FPR (AArch64)
+def aarch64mfp8 : ValueType<8, 255>; // 8-bit value in FPR (AArch64)
 
 // CHERI capabilities. Pointer-like values that carry additional metadata
 // for enforcing safety guarantees on CHERI-enabled targets.
-def c64 : VTCheriCapability<64, 254>;   // 64-bit CHERI capability value
-def c128 : VTCheriCapability<128, 255>; // 128-bit CHERI capability value
+def c64 : VTCheriCapability<64, 256>;   // 64-bit CHERI capability value
+def c128 : VTCheriCapability<128, 257>; // 128-bit CHERI capability value
 
 let isNormalValueType = false in {
 // Pseudo valuetype mapped to the current CHERI capability pointer size.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a77eb0240e677..df16e0b4cfa91 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5520,7 +5520,8 @@ static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     // Unfortunately, we can't do this in the legalizer because there is no
     // way to setOperationAction for an non-simple type.
     StoreSDNode *ST = cast<StoreSDNode>(N);
-    if (!ST->getValue().getValueType().isSimple())
+    if (!ST->getValue().getValueType().isSimple() ||
+        ST->getValue().getScalarValueSizeInBits() >= 256)
       return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
   }
 
@@ -5533,7 +5534,8 @@ static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     // Here is our chance to custom lower a load with a non-simple type.
     // Unfortunately, we can't do this in the legalizer because there is no
     // way to setOperationAction for an non-simple type.
-    if (!N->getValueType(0).isSimple())
+    if (!N->getValueType(0).isSimple() ||
+        N->getValueType(0).getScalarSizeInBits() >= 256)
       return lowerLoadVector(N, DCI.DAG, STI);
   }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 81da77b88bfa0..5f93d655568d5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2151,6 +2151,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (Subtarget.hasCDI()) {
+    for (auto VT : {MVT::i256, MVT::i512}) {
+      if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
+        continue;
+      setOperationAction(ISD::CTLZ, VT, Custom);
+      setOperationAction(ISD::CTTZ, VT, Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+    }
     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
       setOperationAction(ISD::CTLZ, VT, Legal);
     }
@@ -2654,10 +2662,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::AVGCEILU,
                        ISD::AVGFLOORS,
                        ISD::AVGFLOORU,
-                       ISD::CTLZ,
-                       ISD::CTTZ,
-                       ISD::CTLZ_ZERO_UNDEF,
-                       ISD::CTTZ_ZERO_UNDEF,
                        ISD::BITREVERSE,
                        ISD::ADD,
                        ISD::FADD,
@@ -33818,6 +33822,59 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     return;
   }
+  case ISD::CTLZ:
+  case ISD::CTTZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTTZ_ZERO_UNDEF: {
+    // Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512
+    // vXi64 CTLZ/CTTZ and VECTOR_COMPRESS.
+    // Compute the CTLZ/CTTZ of each element, add the element's bit offset,
+    // compress the result to remove all zero elements (passthru is set to
+    // scalar bitwidth if all elements are zero) and extract the lowest
+    // compressed element.
+    SDValue N0 = N->getOperand(0);
+    EVT VT = N->getValueType(0);
+    assert(Subtarget.hasCDI() && "AVX512CD required");
+    assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
+    if (VT == MVT::i256 && !X86::mayFoldLoad(N0, Subtarget))
+      return;
+
+    unsigned SizeInBits = VT.getSizeInBits();
+    MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64);
+    MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+    SDValue Vec = DAG.getBitcast(VecVT, N0);
+
+    SmallVector<int, 8> RevMask;
+    SmallVector<SDValue, 8> Offsets;
+    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
+      RevMask.push_back((int)((E - 1) - I));
+      Offsets.push_back(DAG.getConstant(I * 64, dl, MVT::i64));
+    }
+
+    // CTLZ - reverse the elements as we want the top non-zero element at the
+    // bottom for compression.
+    unsigned VecOpc = ISD::CTTZ;
+    if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) {
+      VecOpc = ISD::CTLZ;
+      Vec = DAG.getVectorShuffle(VecVT, dl, Vec, Vec, RevMask);
+    }
+
+    SDValue PassThrough = DAG.getUNDEF(VecVT);
+    if (Opc == ISD::CTLZ || Opc == ISD::CTTZ)
+      PassThrough = DAG.getConstant(SizeInBits, dl, VecVT);
+
+    SDValue IsNonZero = DAG.getSetCC(dl, BoolVT, Vec,
+                                     DAG.getConstant(0, dl, VecVT), ISD::SETNE);
+    SDValue Cnt = DAG.getNode(VecOpc, dl, VecVT, Vec);
+    Cnt = DAG.getNode(ISD::ADD, dl, VecVT, Cnt,
+                      DAG.getBuildVector(VecVT, dl, Offsets));
+    Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, dl, VecVT, Cnt, IsNonZero,
+                      PassThrough);
+    Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cnt,
+                      DAG.getVectorIdxConstant(0, dl));
+    Results.push_back(DAG.getZExtOrTrunc(Cnt, dl, VT));
+    return;
+  }
   case ISD::MUL: {
     EVT VT = N->getValueType(0);
     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
@@ -55212,65 +55269,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   return combineFneg(N, DAG, DCI, Subtarget);
 }
 
-// Fold i256/i512 CTLZ/CTTZ patterns to make use of AVX512
-// vXi64 CTLZ/CTTZ and VECTOR_COMPRESS.
-// Compute the CTLZ/CTTZ of each element, add the element's bit offset, compress
-// the result to remove all zero elements (passthru is set to scalar bitwidth if
-// all elements are zero) and extract the lowest compressed element.
-static SDValue combineCTZ(SDNode *N, SelectionDAG &DAG,
-                          TargetLowering::DAGCombinerInfo &DCI,
-                          const X86Subtarget &Subtarget) {
-  EVT VT = N->getValueType(0);
-  SDValue N0 = N->getOperand(0);
-  unsigned Opc = N->getOpcode();
-  unsigned SizeInBits = VT.getSizeInBits();
-  assert((Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF || Opc == ISD::CTTZ ||
-          Opc == ISD::CTTZ_ZERO_UNDEF) &&
-         "Unsupported bit count");
-
-  if (VT.isScalarInteger() && Subtarget.hasCDI() &&
-      ((SizeInBits == 512 && Subtarget.useAVX512Regs()) ||
-       (SizeInBits == 256 && Subtarget.hasVLX() &&
-        X86::mayFoldLoad(N0, Subtarget)))) {
-    MVT VecVT = MVT::getVectorVT(MVT::i64, SizeInBits / 64);
-    MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
-    SDValue Vec = DAG.getBitcast(VecVT, N0);
-    SDLoc DL(N);
-
-    SmallVector<int, 8> RevMask;
-    SmallVector<SDValue, 8> Offsets;
-    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
-      RevMask.push_back((int)((E - 1) - I));
-      Offsets.push_back(DAG.getConstant(I * 64, DL, MVT::i64));
-    }
-
-    // CTLZ - reverse the elements as we want the top non-zero element at the
-    // bottom for compression.
-    unsigned VecOpc = ISD::CTTZ;
-    if (Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF) {
-      VecOpc = ISD::CTLZ;
-      Vec = DAG.getVectorShuffle(VecVT, DL, Vec, Vec, RevMask);
-    }
-
-    SDValue PassThrough = DAG.getUNDEF(VecVT);
-    if (Opc == ISD::CTLZ || Opc == ISD::CTTZ)
-      PassThrough = DAG.getConstant(SizeInBits, DL, VecVT);
-
-    SDValue IsNonZero = DAG.getSetCC(DL, BoolVT, Vec,
-                                     DAG.getConstant(0, DL, VecVT), ISD::SETNE);
-    SDValue Cnt = DAG.getNode(VecOpc, DL, VecVT, Vec);
-    Cnt = DAG.getNode(ISD::ADD, DL, VecVT, Cnt,
-                      DAG.getBuildVector(VecVT, DL, Offsets));
-    Cnt = DAG.getNode(ISD::VECTOR_COMPRESS, DL, VecVT, Cnt, IsNonZero,
-                      PassThrough);
-    Cnt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cnt,
-                      DAG.getVectorIdxConstant(0, DL));
-    return DAG.getZExtOrTrunc(Cnt, DL, VT);
-  }
-
-  return SDValue();
-}
-
 static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
@@ -60994,10 +60992,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
-  case ISD::CTLZ:
-  case ISD::CTTZ:
-  case ISD::CTLZ_ZERO_UNDEF:
-  case ISD::CTTZ_ZERO_UNDEF:return combineCTZ(N, DAG, DCI, Subtarget);
   case ISD::BITREVERSE:     return combineBITREVERSE(N, DAG, DCI, Subtarget);
   case ISD::AVGCEILS:
   case ISD::AVGCEILU:
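For readers unfamiliar with the VECTOR_COMPRESS trick used in the ReplaceNodeResults code above, the following standalone sketch models what the i512 CTLZ lowering computes, with plain C++ standing in for the DAG nodes (vplzcntq for the per-element CTLZ, vpaddq for the bit offsets, vpcompressq for the compression). It is an illustrative model only, not code from this patch: the function name ctlz512Model, the limb-array representation, and the use of C++20 std::countl_zero are assumptions made for the example; the limb array plays the role of the v8i64 bitcast of the i512 value.

#include <array>
#include <bit>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative model (not the patch itself) of the i512 CTLZ lowering,
// operating on the value as 8 x i64 limbs (limb[0] = least significant
// 64 bits), i.e. the v8i64 bitcast used by the DAG code above.
static unsigned ctlz512Model(const std::array<uint64_t, 8> &Limbs) {
  // CTLZ: reverse the elements so the most significant limb comes first,
  // mirroring the VECTOR_SHUFFLE with RevMask in the lowering.
  std::array<uint64_t, 8> Rev;
  for (unsigned I = 0; I != 8; ++I)
    Rev[I] = Limbs[7 - I];

  // Per-element CTLZ plus the element's bit offset (vplzcntq + vpaddq),
  // then "compress" away the all-zero elements (vpcompressq with the
  // non-zero mask). The pass-through value 512 is the result when every
  // limb is zero, matching PassThrough = SizeInBits above.
  std::vector<uint64_t> Compressed;
  for (unsigned I = 0; I != 8; ++I)
    if (Rev[I] != 0)
      Compressed.push_back(std::countl_zero(Rev[I]) + I * 64);

  // EXTRACT_VECTOR_ELT index 0: the first surviving element is the answer.
  return Compressed.empty() ? 512 : (unsigned)Compressed.front();
}

int main() {
  std::array<uint64_t, 8> X{}; // 512-bit value, all zero
  std::printf("%u\n", ctlz512Model(X)); // 512
  X[5] = 1;                             // highest set bit is bit 320
  std::printf("%u\n", ctlz512Model(X)); // 191 leading zeros
}

The CTTZ path is the same algorithm without the initial element reversal, and the *_ZERO_UNDEF variants simply leave the all-zero pass-through undefined, as the PassThrough handling in the patch shows.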
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 22c4ad28059e4..5b3b27a3d61de 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -718,8 +718,7 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
 ; AVX512-NEXT:    vptestmq %ymm0, %ymm0, %k1
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
 ; AVX512-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a0 = load i256, ptr %p0
@@ -988,8 +987,7 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; AVX512-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
 ; AVX512-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1251,113 +1249,51 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
 ;
 ; AVX512-LABEL: test_ctlz_i1024:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
 ; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq %r9, %r14
-; AVX512-NEXT:    movq %r8, %r11
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT:    lzcntq %r12, %rcx
-; AVX512-NEXT:    lzcntq %r8, %r9
-; AVX512-NEXT:    addl $64, %r9d
-; AVX512-NEXT:    testq %r12, %r12
-; AVX512-NEXT:    cmovnel %ecx, %r9d
-; AVX512-NEXT:    lzcntq %r10, %rsi
-; AVX512-NEXT: lzcntq %rax, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %r8, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %r9d, %ecx -; AVX512-NEXT: lzcntq %rbx, %rdi -; AVX512-NEXT: lzcntq %r15, %rsi -; AVX512-NEXT: addl $64, %esi -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %edi, %esi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: lzcntq %r13, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: lzcntq %r9, %rdi -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %edi, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r15, %rdi -; AVX512-NEXT: orq %rbx, %rdi -; AVX512-NEXT: cmovnel %esi, %ebp -; AVX512-NEXT: addl $256, %ebp # imm = 0x100 -; AVX512-NEXT: movq %r10, %rdi -; AVX512-NEXT: orq %r12, %rdi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: lzcntq %r12, %rcx -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %r11, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: lzcntq %r14, %rsi -; AVX512-NEXT: testq %r14, %r14 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %rdi, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq %rdx, %rdi -; AVX512-NEXT: lzcntq %rdx, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: lzcntq %rsi, %r8 -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, %rdi -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: orq %r12, %r14 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %r9, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 
+; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r14 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r14, %r11 -; AVX512-NEXT: cmovnel %ecx, %eax ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: orq %rbx, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: orq %r14, %r11 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: orq %rbx, %r10 +; AVX512-NEXT: orq %r11, %r10 +; AVX512-NEXT: cmovel %ecx, %eax ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 ; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0) %res = trunc i1024 %cnt to i32 @@ -1626,116 +1562,35 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_i1024: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 32(%rdi), %r14 -; AVX512-NEXT: movq 48(%rdi), %rbp -; AVX512-NEXT: movq 64(%rdi), %r11 -; AVX512-NEXT: movq 72(%rdi), %r10 -; AVX512-NEXT: movq 80(%rdi), %rdx -; AVX512-NEXT: movq 88(%rdi), %rbx -; AVX512-NEXT: movq 96(%rdi), %rsi -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: movq 112(%rdi), %r8 -; AVX512-NEXT: movq 120(%rdi), %r15 -; AVX512-NEXT: lzcntq %r15, %rax -; AVX512-NEXT: lzcntq %r8, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r15, %r15 -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: lzcntq %r9, %r12 -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %r8, %r12 -; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: orq %r15, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %rbx, %rcx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rdx, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ecx, %r13d -; AVX512-NEXT: lzcntq %r10, %rcx -; AVX512-NEXT: lzcntq %r11, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rdx, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: movq 56(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: lzcntq %r13, %rcx -; AVX512-NEXT: movq %rbp, %rsi -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rbp, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: 
lzcntq %r14, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rdx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %edx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r13, %rdx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: lzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 24(%rdi), %rdx -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: lzcntq %rsi, %rdi -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rdx, %r9 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r13, %r8 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq %r15, %rbx -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: movq 80(%rdi), %rsi +; AVX512-NEXT: movq 64(%rdi), %rcx +; AVX512-NEXT: movq 72(%rdi), %rdx +; AVX512-NEXT: movq 88(%rdi), %r8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512-NEXT: vmovd %xmm1, %r9d +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r10, %r11 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: orq 120(%rdi), %r8 +; AVX512-NEXT: orq 104(%rdi), %rdx +; AVX512-NEXT: orq 112(%rdi), %rsi +; AVX512-NEXT: orq %r8, %rdx +; AVX512-NEXT: orq 96(%rdi), %rcx +; AVX512-NEXT: orq %rsi, %rcx +; AVX512-NEXT: orq %rdx, %rcx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0) @@ -1944,8 +1799,7 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ; AVX512-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z} -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i256, ptr %p0 @@ -2210,8 +2064,7 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ; AVX512-NEXT: vplzcntq 
%zmm0, %zmm0 ; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i512, ptr %p0 @@ -2475,113 +2328,50 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; ; AVX512-LABEL: test_ctlz_undef_i1024: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 ; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r14 -; AVX512-NEXT: movq %r8, %r11 -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: lzcntq %r12, %rcx -; AVX512-NEXT: lzcntq %r8, %r9 -; AVX512-NEXT: addl $64, %r9d -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: cmovnel %ecx, %r9d -; AVX512-NEXT: lzcntq %r10, %rsi -; AVX512-NEXT: lzcntq %rax, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %r8, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %r9d, %ecx -; AVX512-NEXT: lzcntq %rbx, %rdi -; AVX512-NEXT: lzcntq %r15, %rsi -; AVX512-NEXT: addl $64, %esi -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %edi, %esi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: lzcntq %r13, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: lzcntq %r9, %rdi -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: cmovnel %edi, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r15, %rdi -; AVX512-NEXT: orq %rbx, %rdi -; AVX512-NEXT: cmovnel %esi, %ebp -; AVX512-NEXT: addl $256, %ebp # imm = 0x100 -; AVX512-NEXT: movq %r10, %rdi -; AVX512-NEXT: orq %r12, %rdi -; AVX512-NEXT: movq %rax, %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rdi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: lzcntq %rdi, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: lzcntq %r12, %rcx -; AVX512-NEXT: testq %r12, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %r11, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: lzcntq %r14, %rsi -; AVX512-NEXT: testq %r14, %r14 -; AVX512-NEXT: cmovnel %esi, %ecx -; AVX512-NEXT: subl $-128, %ecx -; AVX512-NEXT: movq %rdi, %rsi -; AVX512-NEXT: orq %r12, %rsi -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq %rdx, %rdi -; AVX512-NEXT: lzcntq %rdx, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: lzcntq %r10, %rax -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: lzcntq %rsi, %r8 -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %r8d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r10, 
%rdi -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: orq %r12, %r14 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: vmovq %rdi, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq %rdx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm2 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %r9, %xmm3 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448] +; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %ecx +; AVX512-NEXT: addl $512, %ecx # imm = 0x200 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r14 ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r14, %r11 -; AVX512-NEXT: cmovnel %ecx, %eax ; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: orq %rbx, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: orq %r14, %r11 +; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: orq %rbx, %r10 +; AVX512-NEXT: orq %r11, %r10 +; AVX512-NEXT: cmovel %ecx, %eax ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 ; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 @@ -2850,116 +2640,34 @@ define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind { ; ; AVX512-LABEL: load_ctlz_undef_i1024: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 32(%rdi), %r14 -; AVX512-NEXT: movq 48(%rdi), %rbp -; AVX512-NEXT: movq 64(%rdi), %r11 -; AVX512-NEXT: movq 72(%rdi), %r10 -; AVX512-NEXT: movq 80(%rdi), %rdx -; AVX512-NEXT: movq 88(%rdi), %rbx -; AVX512-NEXT: movq 96(%rdi), %rsi -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: movq 112(%rdi), %r8 -; AVX512-NEXT: movq 120(%rdi), %r15 -; AVX512-NEXT: lzcntq %r15, %rax -; AVX512-NEXT: lzcntq %r8, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: testq %r15, %r15 -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: lzcntq %r9, %r12 -; AVX512-NEXT: lzcntq %rsi, %rax -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r9, %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %r8, %r12 -; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: orq %r15, %r12 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %rbx, %rcx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rdx, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ecx, %r13d -; AVX512-NEXT: lzcntq %r10, %rcx -; AVX512-NEXT: lzcntq %r11, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rdx, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: movq 56(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: lzcntq %r13, %rcx -; AVX512-NEXT: movq %rbp, %rsi -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: lzcntq %rbp, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: lzcntq %r14, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: lzcntq %r8, %rdx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %edx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r13, %rdx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 16(%rdi), %r9 -; AVX512-NEXT: lzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 24(%rdi), %rdx -; AVX512-NEXT: lzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 8(%rdi), %rsi -; AVX512-NEXT: lzcntq (%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: lzcntq %rsi, %rdi -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %rdx, %r9 -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r13, %r8 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq %r15, %rbx -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 +; AVX512-NEXT: movq 80(%rdi), %rsi +; AVX512-NEXT: movq 64(%rdi), %rcx +; AVX512-NEXT: movq 72(%rdi), %rdx +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512-NEXT: vpermq 64(%rdi), %zmm0, %zmm1 +; AVX512-NEXT: movq 88(%rdi), %r8 +; AVX512-NEXT: vplzcntq %zmm1, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448] +; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z} +; AVX512-NEXT: vmovd %xmm1, %r9d +; AVX512-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: addl $512, 
%eax # imm = 0x200 -; AVX512-NEXT: orq %r10, %r11 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp +; AVX512-NEXT: orq 120(%rdi), %r8 +; AVX512-NEXT: orq 104(%rdi), %rdx +; AVX512-NEXT: orq 112(%rdi), %rsi +; AVX512-NEXT: orq %r8, %rdx +; AVX512-NEXT: orq 96(%rdi), %rcx +; AVX512-NEXT: orq %rsi, %rcx +; AVX512-NEXT: orq %rdx, %rcx +; AVX512-NEXT: cmovnel %r9d, %eax +; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1) @@ -3165,8 +2873,7 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] ; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3181,8 +2888,7 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 ; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] ; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} -; AVX512POPCNT-NEXT: vmovq %xmm0, %rax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 @@ -3459,8 +3165,7 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] ; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3475,8 +3180,7 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] ; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} -; AVX512POPCNT-NEXT: vmovq %xmm0, %rax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 @@ -3716,111 +3420,93 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r14 -; AVX512-NEXT: movq %r8, %r15 -; AVX512-NEXT: movq %rcx, %r11 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: movq %rsi, %r9 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rdx, %r13 -; AVX512-NEXT: tzcntq %r11, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %r13d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %rdi, %r13 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %r12d, %eax -; 
AVX512-NEXT: tzcntq %r8, %r12 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: tzcntq %r14, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %r12d, %r13d -; AVX512-NEXT: tzcntq %rcx, %rbp -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %ebp, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %r8, %rbp -; AVX512-NEXT: orq %r14, %rbp -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %r13 -; AVX512-NEXT: orq %r11, %r13 -; AVX512-NEXT: movq %rdi, %rbp -; AVX512-NEXT: orq %rdx, %rbp -; AVX512-NEXT: orq %r13, %rbp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rbx, %rbp -; AVX512-NEXT: tzcntq %r13, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: tzcntq %rsi, %rcx -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rbx, %rcx -; AVX512-NEXT: orq %r13, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512-NEXT: tzcntq %r14, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: tzcntq %r8, %rsi -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %esi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r13, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq %r15, %rdi -; AVX512-NEXT: orq %r10, %rdi -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %rdi -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vmovq %r9, %xmm2 +; AVX512F-NEXT: vmovq %r8, %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} 
zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %r10d +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %r9, %rsi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %r8, %rdi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512F-NEXT: orq %rdi, %rdx +; AVX512F-NEXT: orq %rcx, %rdx +; AVX512F-NEXT: cmovnel %r10d, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vmovq %r9, %xmm2 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm3 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2 +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm1, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %r9, %rsi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %r8, %rdi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512POPCNT-NEXT: orq %rdi, %rdx +; AVX512POPCNT-NEXT: orq %rcx, %rdx +; AVX512POPCNT-NEXT: cmovnel %r10d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) %res = trunc i1024 %cnt to i32 ret i32 %res @@ -4069,120 +3755,79 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 88(%rdi), %rbp -; AVX512-NEXT: movq 72(%rdi), %r15 -; 
AVX512-NEXT: movq 56(%rdi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %r10 -; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 32(%rdi), %rsi -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %rbx -; AVX512-NEXT: movq (%rdi), %r8 -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r11, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: tzcntq %rbx, %r12 -; AVX512-NEXT: tzcntq %r14, %rax -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %r8, %r12 -; AVX512-NEXT: orq %r11, %r12 -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: tzcntq %rsi, %rdx -; AVX512-NEXT: tzcntq %r10, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmovnel %edx, %r13d -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: tzcntq %rcx, %rdx -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %edx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r10, %rdx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r11, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: movq %r8, %r13 -; AVX512-NEXT: orq %rbx, %r13 -; AVX512-NEXT: orq %rdx, %r13 -; AVX512-NEXT: movq 64(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %r13, %rdx -; AVX512-NEXT: tzcntq %r15, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: movq %rbp, %r14 -; AVX512-NEXT: tzcntq %rbp, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 80(%rdi), %r10 -; AVX512-NEXT: tzcntq %r10, %rcx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r13, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: tzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 96(%rdi), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 112(%rdi), %rsi -; AVX512-NEXT: tzcntq 120(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: tzcntq %rsi, %rdi -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r14, %r15 -; AVX512-NEXT: orq %r10, %r13 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; 
AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: movq 8(%rdi), %rdx +; AVX512F-NEXT: movq 24(%rdi), %rsi +; AVX512F-NEXT: orq 56(%rdi), %rsi +; AVX512F-NEXT: orq 40(%rdi), %rdx +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: orq 48(%rdi), %rax +; AVX512F-NEXT: orq 32(%rdi), %rcx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm1, %esi +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %esi, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512POPCNT-NEXT: movq 16(%rdi), %rax +; AVX512POPCNT-NEXT: movq (%rdi), %rcx +; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx +; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: orq 48(%rdi), %rax +; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rax, %rcx +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %esi +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %esi, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0) %res = trunc i1024 %cnt to i32 @@ -4382,8 +4027,7 @@ define i32 
@load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vptestmq %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4397,8 +4041,7 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 ; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z} -; AVX512POPCNT-NEXT: vmovq %xmm0, %rax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i256, ptr %p0 @@ -4670,8 +4313,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4685,8 +4327,7 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} -; AVX512POPCNT-NEXT: vmovq %xmm0, %rax -; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq %a0 = load i512, ptr %p0 @@ -4924,111 +4565,91 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_cttz_undef_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq %r9, %r14 -; AVX512-NEXT: movq %r8, %r15 -; AVX512-NEXT: movq %rcx, %r11 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: movq %rsi, %r9 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: tzcntq %rdi, %rax -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rdi, %rdi -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rdx, %r13 -; AVX512-NEXT: tzcntq %r11, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %r13d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq %rdi, %r13 -; AVX512-NEXT: orq %r9, %r13 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: tzcntq %r8, %r12 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: tzcntq %r14, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %r12d, %r13d -; AVX512-NEXT: tzcntq %rcx, %rbp -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %ebp, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %r8, %rbp -; AVX512-NEXT: orq %r14, %rbp -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r9, %r13 -; AVX512-NEXT: orq %r11, %r13 -; AVX512-NEXT: movq %rdi, %rbp -; AVX512-NEXT: orq %rdx, %rbp -; AVX512-NEXT: orq %r13, 
%rbp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %rbx, %rbp -; AVX512-NEXT: tzcntq %r13, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: tzcntq %rsi, %rcx -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %rbx, %rcx -; AVX512-NEXT: orq %r13, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512-NEXT: tzcntq %r14, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: tzcntq %r8, %rsi -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %esi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r13, %rbx -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq %r15, %rdi -; AVX512-NEXT: orq %r10, %rdi -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r9, %rdi -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: test_cttz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %rcx, %xmm0 +; AVX512F-NEXT: vmovq %rdx, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vmovq %rsi, %xmm1 +; AVX512F-NEXT: vmovq %rdi, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovq %r9, %xmm1 +; AVX512F-NEXT: vmovq %r8, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %r10d +; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %r9, %rsi +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512F-NEXT: orq %rsi, %rcx +; AVX512F-NEXT: orq %r8, %rdi +; AVX512F-NEXT: orq 
{{[0-9]+}}(%rsp), %rdx +; AVX512F-NEXT: orq %rdi, %rdx +; AVX512F-NEXT: orq %rcx, %rdx +; AVX512F-NEXT: cmovnel %r10d, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: test_cttz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0 +; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1 +; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512POPCNT-NEXT: vmovq %r9, %xmm1 +; AVX512POPCNT-NEXT: vmovq %r8, %xmm2 +; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1 +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %r9, %rsi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rcx +; AVX512POPCNT-NEXT: orq %r8, %rdi +; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx +; AVX512POPCNT-NEXT: orq %rdi, %rdx +; AVX512POPCNT-NEXT: orq %rcx, %rdx +; AVX512POPCNT-NEXT: cmovnel %r10d, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 ret i32 %res @@ -5276,120 +4897,77 @@ define i32 @load_cttz_undef_i1024(ptr %p0) nounwind { ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_cttz_undef_i1024: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: movq 88(%rdi), %rbp -; AVX512-NEXT: movq 72(%rdi), %r15 -; AVX512-NEXT: movq 56(%rdi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %rcx -; AVX512-NEXT: movq 40(%rdi), %r10 -; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 32(%rdi), %rsi -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %rbx -; AVX512-NEXT: movq (%rdi), %r8 -; AVX512-NEXT: movq 8(%rdi), %r11 -; AVX512-NEXT: tzcntq %r8, %rax -; AVX512-NEXT: tzcntq %r11, %rdx -; AVX512-NEXT: addl $64, %edx -; AVX512-NEXT: testq %r8, %r8 -; AVX512-NEXT: cmovnel %eax, %edx -; AVX512-NEXT: tzcntq %rbx, %r12 -; AVX512-NEXT: tzcntq %r14, %rax -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %rbx, %rbx -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: movq 
%r8, %r12 -; AVX512-NEXT: orq %r11, %r12 -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: tzcntq %rsi, %rdx -; AVX512-NEXT: tzcntq %r10, %r13 -; AVX512-NEXT: addl $64, %r13d -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: cmovnel %edx, %r13d -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: tzcntq %rcx, %rdx -; AVX512-NEXT: tzcntq %r9, %r12 -; AVX512-NEXT: addl $64, %r12d -; AVX512-NEXT: testq %rcx, %rcx -; AVX512-NEXT: cmovnel %edx, %r12d -; AVX512-NEXT: subl $-128, %r12d -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: orq %r10, %rdx -; AVX512-NEXT: cmovnel %r13d, %r12d -; AVX512-NEXT: addl $256, %r12d # imm = 0x100 -; AVX512-NEXT: movq %r11, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: movq %r8, %r13 -; AVX512-NEXT: orq %rbx, %r13 -; AVX512-NEXT: orq %rdx, %r13 -; AVX512-NEXT: movq 64(%rdi), %r13 -; AVX512-NEXT: cmovnel %eax, %r12d -; AVX512-NEXT: tzcntq %r13, %rdx -; AVX512-NEXT: tzcntq %r15, %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: testq %r13, %r13 -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: movq %rbp, %r14 -; AVX512-NEXT: tzcntq %rbp, %rbp -; AVX512-NEXT: addl $64, %ebp -; AVX512-NEXT: movq 80(%rdi), %r10 -; AVX512-NEXT: tzcntq %r10, %rcx -; AVX512-NEXT: testq %r10, %r10 -; AVX512-NEXT: cmovnel %ecx, %ebp -; AVX512-NEXT: subl $-128, %ebp -; AVX512-NEXT: movq %r13, %rcx -; AVX512-NEXT: orq %r15, %rcx -; AVX512-NEXT: cmovnel %eax, %ebp -; AVX512-NEXT: movq 104(%rdi), %r9 -; AVX512-NEXT: tzcntq %r9, %rcx -; AVX512-NEXT: addl $64, %ecx -; AVX512-NEXT: movq 96(%rdi), %rdx -; AVX512-NEXT: tzcntq %rdx, %rax -; AVX512-NEXT: testq %rdx, %rdx -; AVX512-NEXT: cmovnel %eax, %ecx -; AVX512-NEXT: movq 112(%rdi), %rsi -; AVX512-NEXT: tzcntq 120(%rdi), %rax -; AVX512-NEXT: addl $64, %eax -; AVX512-NEXT: tzcntq %rsi, %rdi -; AVX512-NEXT: testq %rsi, %rsi -; AVX512-NEXT: cmovnel %edi, %eax -; AVX512-NEXT: subl $-128, %eax -; AVX512-NEXT: orq %r9, %rdx -; AVX512-NEXT: cmovnel %ecx, %eax -; AVX512-NEXT: orq %r14, %r15 -; AVX512-NEXT: orq %r10, %r13 -; AVX512-NEXT: addl $256, %eax # imm = 0x100 -; AVX512-NEXT: orq %r15, %r13 -; AVX512-NEXT: cmovnel %ebp, %eax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; AVX512-NEXT: orq %rcx, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: addl $512, %eax # imm = 0x200 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: cmovnel %r12d, %eax -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: retq +; AVX512F-LABEL: load_cttz_undef_i1024: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: movq 8(%rdi), %rdx +; AVX512F-NEXT: movq 24(%rdi), %rsi +; AVX512F-NEXT: orq 56(%rdi), %rsi +; AVX512F-NEXT: orq 40(%rdi), %rdx +; AVX512F-NEXT: orq 48(%rdi), %rax +; AVX512F-NEXT: orq 32(%rdi), %rcx +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512F-NEXT: vpaddq %zmm2, 
%zmm1, %zmm3 +; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vplzcntq %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm1, %esi +; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: addl $512, %eax # imm = 0x200 +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: cmovnel %esi, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512POPCNT-LABEL: load_cttz_undef_i1024: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0 +; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1 +; AVX512POPCNT-NEXT: movq 16(%rdi), %rax +; AVX512POPCNT-NEXT: movq (%rdi), %rcx +; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx +; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi +; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx +; AVX512POPCNT-NEXT: orq 48(%rdi), %rax +; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: orq %rax, %rcx +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3 +; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448] +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm1, %esi +; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200 +; AVX512POPCNT-NEXT: orq %rdx, %rcx +; AVX512POPCNT-NEXT: cmovnel %esi, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq %a0 = load i1024, ptr %p0 %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1) %res = trunc i1024 %cnt to i32 diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td index 230a6730c610a..867d6f85bdecb 100644 --- a/llvm/test/TableGen/CPtrWildcard.td +++ b/llvm/test/TableGen/CPtrWildcard.td @@ -8,13 +8,13 @@ // CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]], // CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src // CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope -// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/126|128,1/*254*/, +// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/0|128,2/*256*/, // CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64), // CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, // CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8 // CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src) // CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/ -// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/127|128,1/*255*/, +// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/1|128,2/*257*/, // CHECK-NEXT:/* 21*/ 
OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64), // CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, // CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8