[mlir][sparse][bufferization] refine bufferization assumption enforcement

Enforce the assumptions made on tensor buffers explicitly. When in-place,
reuse the buffer, but fill it with all zeroes for the non-update case, since
such a kernel assumes the output starts out as all zeroes and writes only the
nonzeroes. When not in-place, zero out the new buffer when materializing or
when no updates occur, and copy the original tensor value when updates occur.
This prepares for migrating to the new bufferization strategy, where these
assumptions must be made explicit.

Reviewed By: springerm

Differential Revision: https://reviews.llvm.org/D128691
aartbik committed Jun 28, 2022
1 parent 3706bda commit eca6f91
Showing 9 changed files with 248 additions and 118 deletions.
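
Before the diffs, an illustrative MLIR sketch of the update vs. non-update
distinction that drives this change (not part of this commit; the encoding,
trait, and names are assumed for the example). Reading the output block
argument %c makes the kernel an update x(i) += y(i) * z(i), so only nonzero
positions of x are involved:

#SparseVector = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>

#trait = {
  indexing_maps = [
    affine_map<(i) -> (i)>,  // y
    affine_map<(i) -> (i)>,  // z
    affine_map<(i) -> (i)>   // x (out)
  ],
  iterator_types = ["parallel"]
}

func.func @update(%y: tensor<32xf32, #SparseVector>, %z: tensor<32xf32>,
                  %x: tensor<32xf32>) -> tensor<32xf32> {
  %0 = linalg.generic #trait
      ins(%y, %z : tensor<32xf32, #SparseVector>, tensor<32xf32>)
      outs(%x : tensor<32xf32>) {
  ^bb0(%a: f32, %b: f32, %c: f32):
    %m = arith.mulf %a, %b : f32
    %s = arith.addf %c, %m : f32  // %c is read: update semantics
    linalg.yield %s : f32
  } -> tensor<32xf32>
  return %0 : tensor<32xf32>
}

A non-update kernel x(i) = y(i) * z(i) would yield %m without ever reading
%c; such a kernel writes only the nonzeroes and therefore relies on the
output buffer starting out as all zeroes, which is exactly the assumption
this commit enforces explicitly.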
41 changes: 26 additions & 15 deletions mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -450,26 +450,37 @@ static void updateReduc(Merger &merger, CodeGen &codegen, Value reduc) {
 /// the output buffer is already initialized to all zeroes and only nonzeroes
 /// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)),
 /// only nonzeroes values are used for the updates and no assumption on the
-/// original contents of the output buffer is necessary..
+/// original contents of the output buffer is necessary.
 static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder,
                              linalg::GenericOp op, MemRefType denseTp,
                              ArrayRef<Value> args) {
   Location loc = op.getLoc();
-  Value tensor = op.getOutputOperand(0)->get();
-  // The output tensor simply could materialize from the buffer that will
-  // be generated for the tensor present in the outs() clause. This has
-  // the major advantage that the sparse kernel only updates the nonzero
-  // positions for the output tensor.
-  if (isInPlace(tensor))
-    return builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
-  // By default, a new buffer is allocated which is initialized to the
-  // tensor defined in the outs() clause. This is always correct but
-  // introduces a dense initialization component that may negatively
-  // impact the running complexity of the sparse kernel. If the tensor
-  // materializes into the computation, we need to preserve the zero
-  // initialization assumption of all sparse output buffers.
+  OpOperand *lhs = op.getOutputOperand(0);
+  Value tensor = lhs->get();
+  bool isInit = op.isInitTensor(lhs);
+  // An output tensor that is in-place can simply materialize from the buffer
+  // of the tensor that appears in the outs() clause. For updates, this has
+  // the advantage that only the nonzero values are involved in the computation,
+  // keeping the operation O(nnz). In all other cases, we are forced to zero
+  // out the buffer to enforce the assumption above, which may negatively
+  // impact running complexity (viz. O(n^2 + nnz) vs. O(nnz) for matrices).
+  // TODO: use better analysis to avoid zeroing out the buffer?
+  if (isInPlace(tensor)) {
+    Value init =
+        builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+    if (!isInit) {
+      Value zero = constantZero(builder, loc, denseTp.getElementType());
+      builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{init});
+    }
+    return init;
+  }
+  // By default, a new buffer is allocated which is either set to zero (when
+  // no updates occur or the tensor materializes into this computation) or
+  // initialized to the value of the tensor defined in the outs() clause.
+  // This is always correct (since it enforces all assumptions above) but
+  // may negatively impact running complexity as explained above.
   Value alloc = builder.create<memref::AllocOp>(loc, denseTp, args);
-  if (isMaterializing(tensor)) {
+  if (!isInit || isMaterializing(tensor)) {
     Value zero = constantZero(builder, loc, denseTp.getElementType());
     builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{alloc});
   } else {
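
For reference, a minimal sketch of the two buffer set-ups the revised
genOutputBuffer emits for a not-in-place output (SSA names %t, %buf0, %buf1,
and %src are illustrative; the copy path matches the memref.copy previously
checked in dense.mlir):

// No updates occur (or the tensor materializes here): enforce the
// all-zeroes assumption on the fresh buffer.
%zero = arith.constant 0.000000e+00 : f32
%buf0 = memref.alloc() : memref<32x16xf32>
linalg.fill ins(%zero : f32) outs(%buf0 : memref<32x16xf32>)

// Updates occur: start from the current contents of the outs() tensor %t.
%buf1 = memref.alloc() : memref<32x16xf32>
%src = bufferization.to_memref %t : memref<32x16xf32>
memref.copy %src, %buf1 : memref<32x16xf32> to memref<32x16xf32>

Zeroing or copying a dense m x n buffer costs O(m * n) writes regardless of
the number of nonzeroes, which is the O(n^2 + nnz) vs. O(nnz) trade-off noted
in the comment above.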
4 changes: 2 additions & 2 deletions mlir/test/Dialect/SparseTensor/dense.mlir
@@ -35,15 +35,15 @@
 // CHECK-LABEL: func @dense1(
 // CHECK-SAME: %[[VAL_0:.*]]: tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>>,
 // CHECK-SAME: %[[VAL_1:.*]]: tensor<32x16xf32> {linalg.inplaceable = false}) -> tensor<32x16xf32> {
+// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK-DAG: %[[VAL_3:.*]] = arith.constant 32 : index
 // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 16 : index
 // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xf32>
-// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
 // CHECK-DAG: %[[VAL_9:.*]] = memref.alloc() : memref<32x16xf32>
-// CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32x16xf32> to memref<32x16xf32>
+// CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_9]] : memref<32x16xf32>)
 // CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] {
 // CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_10]], %[[VAL_4]] : index
