46 changes: 35 additions & 11 deletions llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,17 +100,41 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
; SSE-LABEL: @store_i64(
; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: ret void
;
; AVX-LABEL: @store_i64(
3 changes: 2 additions & 1 deletion mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -1289,13 +1289,14 @@ struct AngleOpConversion : public OpConversionPattern<complex::AngleOp> {
ConversionPatternRewriter &rewriter) const override {
auto loc = op.getLoc();
auto type = op.getType();
arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr();

Value real =
rewriter.create<complex::ReOp>(loc, type, adaptor.getComplex());
Value imag =
rewriter.create<complex::ImOp>(loc, type, adaptor.getComplex());

rewriter.replaceOpWithNewOp<math::Atan2Op>(op, imag, real);
rewriter.replaceOpWithNewOp<math::Atan2Op>(op, imag, real, fmf);

return success();
}
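
The change above forwards the op's fast-math flags, so `complex.angle %z fastmath<...>` now lowers to a `math.atan2` carrying the same flags; semantically, angle(x + iy) = atan2(y, x). A minimal standalone sanity check of that identity (plain C++, not part of the patch; the input values are arbitrary):

#include <cmath>
#include <cstdio>

// complex.angle(z) lowers to atan2(Im z, Re z); std::arg computes the same
// principal angle, so atan2 must return it for any finite complex input.
int main() {
  const float re = -1.0f, im = 1.0f;
  const float pi = 3.14159265f;
  const float lowered = std::atan2(im, re); // what the lowered IR computes
  std::printf("atan2(%f, %f) = %f (expected 3*pi/4 = %f)\n", im, re, lowered,
              3.0f * pi / 4.0f);
  return 0;
}
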
54 changes: 41 additions & 13 deletions mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1412,10 +1412,11 @@ static SmallVector<int64_t> getTiledPackShape(tensor::PackOp packOp,

/// Create a TransferReadOp from `source` with static shape `readShape`. If the
/// vector type for the read is not the same as the type of `source`, then a
/// mask is created on the read.
/// mask is created on the read. If the `doMasking` parameter is set to
/// false, we update the `inBounds` attribute instead of masking.
static Value createReadOrMaskedRead(OpBuilder &builder, Location loc,
Value source, ArrayRef<int64_t> readShape,
Value padValue) {
Value padValue, bool doMasking = true) {
assert(llvm::none_of(readShape,
[](int64_t s) { return s == ShapedType::kDynamic; }));
auto sourceShape = dyn_cast<ShapedType>(source.getType()).getShape();
@@ -1424,14 +1425,21 @@ static Value createReadOrMaskedRead(OpBuilder &builder, Location loc,
auto vectorType = VectorType::get(readShape, padValue.getType());
int64_t readRank = readShape.size();
auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
SmallVector<bool> inBoundsVal(readRank, true);
if (!doMasking) {
// A dim is in-bounds only when the static read extent matches the source
// extent; out-of-bounds dims are filled from the padding value.
for (unsigned i = 0; i < readRank; i++)
inBoundsVal[i] = sourceShape[i] == readShape[i];
}
auto transferReadOp = builder.create<vector::TransferReadOp>(
loc,
/*vectorType=*/vectorType,
/*source=*/source,
/*indices=*/SmallVector<Value>(readRank, zero),
/*padding=*/padValue,
/*inBounds=*/SmallVector<bool>(readRank, true));
if (llvm::equal(readShape, sourceShape)) {
/*inBounds=*/inBoundsVal);

if (llvm::equal(readShape, sourceShape) || !doMasking) {
return transferReadOp;
}
SmallVector<OpFoldResult> mixedSourceDims =
@@ -1482,11 +1490,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
return write;
}

/// Vectorize tensor::PackOp with (1) static innerTiles and (2) constant
/// padding value into:
/// Vectorize tensor::PackOp with (1) static innerTiles, (2) constant
/// padding value, and (3) input vector sizes into:
/// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds
/// As in the following example:
///
/// %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2]
/// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32>
///
@@ -1505,6 +1512,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
/// %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0]
/// {in_bounds = [true, true, true, true, true]}
/// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
///
/// If the input vector sizes (3) are not provided, the vector sizes are
/// determined by the result tensor shape; in that case we update the
/// inBounds attribute instead of masking.
static LogicalResult
vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
ArrayRef<int64_t> inputVectorSizes,
@@ -1525,6 +1536,16 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
(void)status; // prevent unused variable warning on non-assert builds.
assert(succeeded(status) && "failed to reify result shapes");

// If the input vector sizes are not provided, the vector sizes are
// determined by the result tensor shape. In that case we update the
// inBounds attribute instead of masking.
bool doMasking = true;
if (inputVectorSizes.empty()) {
ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank());
doMasking = false;
}

// Create masked TransferReadOp.
SmallVector<int64_t> inputShape(inputVectorSizes);
auto innerTiles = packOp.getStaticInnerTiles();
@@ -1536,7 +1557,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
for (auto [idx, size] : enumerate(innerTiles))
inputShape[innerDimsPos[idx]] *= size;
auto maskedRead = createReadOrMaskedRead(rewriter, loc, packOp.getSource(),
inputShape, padValue);
inputShape, padValue, doMasking);

// Create ShapeCastOp.
SmallVector<int64_t> destShape(inputVectorSizes);
@@ -1763,7 +1784,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
/// Returns success if `inputVectorSizes` is a valid masking configuration for
/// the given `shape`, i.e., it meets:
/// 1. The numbers of elements in both arrays are equal.
/// 2. `inputVectorSizes` does nos have dynamic dimensions.
/// 2. `inputVectorSizes` does not have dynamic dimensions.
/// 3. All the values in `inputVectorSizes` are greater than or equal to
/// static sizes in `shape`.
static LogicalResult
@@ -1881,18 +1902,25 @@ static LogicalResult vectorizeLinalgOpPrecondition(
return success();
}

/// TODO: Use a matcher to check for a constant padding value.
static LogicalResult
vectorizePackOpPrecondition(tensor::PackOp packOp,
ArrayRef<int64_t> inputVectorSizes) {
auto padValue = packOp.getPaddingValue();
if (padValue && !padValue.getDefiningOp<arith::ConstantOp>()) {
Attribute cstAttr;
if (padValue && !matchPattern(padValue, m_Constant(&cstAttr))) {
LDBG("pad value is not constant: " << packOp << "\n");
return failure();
}

ArrayRef<int64_t> resultTensorShape = packOp.getDestType().getShape();
if (failed(isValidMaskedInputVector(
bool satisfyEmptyCond = true;
if (inputVectorSizes.empty()) {
if (!packOp.getDestType().hasStaticShape() ||
!packOp.getSourceType().hasStaticShape())
satisfyEmptyCond = false;
}

if (!satisfyEmptyCond &&
failed(isValidMaskedInputVector(
resultTensorShape.take_front(packOp.getSourceRank()),
inputVectorSizes)))
return failure();
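
For intuition, here is a standalone sketch of the in-bounds computation the unmasked path performs (a hypothetical free function with illustrative names; the real logic lives inline in createReadOrMaskedRead above):

#include <cassert>
#include <cstdint>
#include <vector>

// When doMasking is false, a read dimension is marked in-bounds only if its
// static read extent equals the source extent; dimensions where the source is
// smaller are read out of bounds and filled from the padding value.
std::vector<bool>
inBoundsForUnmaskedRead(const std::vector<int64_t> &sourceShape,
                        const std::vector<int64_t> &readShape) {
  assert(sourceShape.size() == readShape.size() && "rank mismatch");
  std::vector<bool> inBounds(readShape.size(), true);
  for (size_t i = 0; i < readShape.size(); ++i)
    inBounds[i] = sourceShape[i] == readShape[i];
  return inBounds;
}

For the padded pack test added below, sourceShape = {32, 7, 15} against readShape = {32, 8, 16} yields {true, false, false}, which is exactly the in_bounds list the test checks.
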
13 changes: 13 additions & 0 deletions mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -2187,3 +2187,16 @@ func.func @complex_tanh_nnan_ninf(%arg: complex<f32>) -> complex<f32> {

// CHECK-COUNT-1: arith.select
// CHECK-NOT: arith.select

// -----

// CHECK-LABEL: func.func @complex_angle_with_fmf
// CHECK-SAME: %[[ARG:.*]]: complex<f32>
func.func @complex_angle_with_fmf(%arg: complex<f32>) -> f32 {
%angle = complex.angle %arg fastmath<nnan,contract> : complex<f32>
return %angle : f32
}
// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex<f32>
// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex<f32>
// CHECK: %[[RESULT:.*]] = math.atan2 %[[IMAG]], %[[REAL]] fastmath<nnan,contract> : f32
// CHECK: return %[[RESULT]] : f32
17 changes: 17 additions & 0 deletions mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -109,3 +109,20 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}

// -----

func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor<?xf32>, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> {
%pad = arith.constant 0.000000e+00 : f32
// expected-error @+1 {{Attempted to vectorize, but failed}}
%pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor<?xf32> -> tensor<4x16xf32>
return %pack : tensor<4x16xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 : !transform.any_op
transform.yield
}
}
55 changes: 55 additions & 0 deletions mlir/test/Dialect/Linalg/vectorization.mlir
@@ -930,3 +930,58 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest:
transform.yield
}
}

// -----

// CHECK-LABEL: test_vectorize_pack_no_vector_sizes
func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> {
%pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32>
return %pack : tensor<2x4x16x2xf32>
}
// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME: {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32>
// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32>
// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32>
// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32>
// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32>
// CHECK: return %[[write]] : tensor<2x4x16x2xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 : !transform.any_op
transform.yield
}
}

// -----

// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes
func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> {
%pad = arith.constant 0.000000e+00 : f32
%pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32>
return %pack : tensor<32x4x1x16x2xf32>
}
// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK: %[[transfer_read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]]
// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32>
// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32>
// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32>
// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]]
// CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
// CHECK: return %[[write]] : tensor<32x4x1x16x2xf32>

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 : !transform.any_op
transform.yield
}
}
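
As a worked check of the no-vector-sizes path, the read shape in the padded test above can be recomputed by hand: take the leading source-rank dims of the destination shape, then scale each tiled dim by its inner tile. A hypothetical standalone sketch (illustrative names, not MLIR API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Recomputes the read vector shape for @test_vectorize_padded_pack_no_vector_sizes:
// dest tensor<32x4x1x16x2xf32>, source rank 3, inner_dims_pos = [2, 1],
// inner_tiles = [16, 2] => read shape [32, 8, 16], matching the
// vector<32x8x16xf32> in the CHECK lines.
int main() {
  const std::vector<int64_t> destShape = {32, 4, 1, 16, 2};
  const int64_t sourceRank = 3;
  // Leading source-rank dims of the destination: [32, 4, 1].
  std::vector<int64_t> readShape(destShape.begin(),
                                 destShape.begin() + sourceRank);
  const std::vector<int64_t> innerDimsPos = {2, 1};
  const std::vector<int64_t> innerTiles = {16, 2};
  // Scale each tiled dim: dim 2 by 16 -> 16, dim 1 by 2 -> 8.
  for (size_t i = 0; i < innerTiles.size(); ++i)
    readShape[innerDimsPos[i]] *= innerTiles[i];
  for (int64_t d : readShape)
    std::printf("%lld ", static_cast<long long>(d)); // prints: 32 8 16
  std::printf("\n");
  return 0;
}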