From ad7c49bef774191304b804e60821c0cdb6a390cb Mon Sep 17 00:00:00 2001
From: Jerry Wu
Date: Mon, 9 May 2022 08:43:40 -0700
Subject: [PATCH] [mlir][linalg] Fix padding size calculation for Conv2d ops.

This patch fixes the padding size calculation for Conv2d ops when the
stride > 1. It contains the changes below:

- Use addBound to add the constraint for AffineApplyOp in
  getUpperBoundForIndex, so the result value can be mapped and retrieved
  later.

- Fix the bound derived from AffineMinOp by adding it as a closed bound.
  Originally the bound was added as an open upper bound, which results in
  incorrect bounds when the values are multiplied. For example:

```
%0 = affine.min affine_map<()[s0] -> (4, -s0 + 11)>()[%iv0]
%1 = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%0]

If we add the affine.min as an open bound, addBound will internally
transform it into the closed bound "%0 <= 3". The following getSliceBounds
will then derive the bound of %1 as "%1 <= 6" and return the open bound
"%1 < 7", while the correct bound should be "%1 <= 8".
```

- In addition to addBound, also change getSliceBounds to support returning
  closed upper bounds, since for the size computation we usually care about
  the closed bounds.

- Change getUpperBoundForIndex to favor constant bounds when required.
  getSliceBounds may return a tighter but non-constant bound, which can't be
  used for padding. The constantRequired option requires
  getUpperBoundForIndex to return a constant bound when possible.

Reviewed By: hanchung

Differential Revision: https://reviews.llvm.org/D124821
---
 .../Affine/Analysis/AffineStructures.h        | 23 ++++++-
 .../include/mlir/Dialect/Linalg/Utils/Utils.h |  6 +-
 .../Affine/Analysis/AffineStructures.cpp      | 34 +++++++---
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp       | 35 ++++++++---
 mlir/test/Dialect/Linalg/pad.mlir             | 63 +++++++++++++++++++
 5 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h
index 875e405431961..427d96fef23a6 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineStructures.h
@@ -166,8 +166,25 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron {
   /// bound map is expected to have exactly one result. In case of a LB/UB, the
   /// bound map may have more than one result, for each of which an inequality
   /// is added.
+  ///
+  /// The bound can be added as open or closed by specifying isClosedBound. In
+  /// case of a LB/UB, isClosedBound = false means the bound is added internally
+  /// as a closed bound by +1/-1 respectively. In case of an EQ bound, it can
+  /// only be added as a closed bound.
+  ///
   /// Note: The dimensions/symbols of this FlatAffineConstraints must match the
   /// dimensions/symbols of the affine map.
+  LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap,
+                         bool isClosedBound);
+
+  /// Adds a bound for the identifier at the specified position with constraints
+  /// being drawn from the specified bound map. In case of an EQ bound, the
+  /// bound map is expected to have exactly one result. In case of a LB/UB, the
+  /// bound map may have more than one result, for each of which an inequality
+  /// is added.
+  /// Note: The dimensions/symbols of this FlatAffineConstraints must match the
+  /// dimensions/symbols of the affine map. By default the lower bound is closed
+  /// and the upper bound is open.
   LogicalResult addBound(BoundType type, unsigned pos, AffineMap boundMap);
 
   /// Adds a bound for the identifier at the specified position with constraints
@@ -197,9 +214,13 @@ class FlatAffineValueConstraints : public presburger::IntegerPolyhedron {
   /// identifiers as floordiv's and mod's of affine expressions of other
   /// identifiers with respect to (positive) constants. Sets bound map to a
   /// null AffineMap if such a bound can't be found (or yet unimplemented).
+  ///
+  /// By default the returned lower bounds are closed and upper bounds are open.
+  /// This can be changed by getClosedUB.
   void getSliceBounds(unsigned offset, unsigned num, MLIRContext *context,
                       SmallVectorImpl<AffineMap> *lbMaps,
-                      SmallVectorImpl<AffineMap> *ubMaps);
+                      SmallVectorImpl<AffineMap> *ubMaps,
+                      bool getClosedUB = false);
 
   /// Composes an affine map whose dimensions and symbols match one to one with
   /// the dimensions and symbols of this FlatAffineConstraints. The results of
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
index bd427ea3d9e50..3e9d07209017c 100644
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -50,6 +50,9 @@ SmallVector<Value> getDynOperands(Location loc, Value val, OpBuilder &b);
 /// values. The method sets `boundMap` to an affine map that given
 /// `boundOperands` evaluates to an upper bound for the index computation.
 ///
+/// If constantRequired is true, only returns the constant bounds (potentially
+/// over-approximating) and fails when not possible.
+///
 /// Example:
 /// ```
 /// %dim0 = dim %tensor, %c0
@@ -62,7 +65,8 @@ SmallVector<Value> getDynOperands(Location loc, Value val, OpBuilder &b);
 /// - boundMap = affine.map<(d0) -> (d0 + 40)>
 /// - boundOperands = [%dim1]
 void getUpperBoundForIndex(Value value, AffineMap &boundMap,
-                           SmallVectorImpl<Value> &boundOperands);
+                           SmallVectorImpl<Value> &boundOperands,
+                           bool constantRequired = false);
 
 /// Returns a constant upper bound for the result `value` of an index
 /// computation. Calls `getUpperBoundForIndex` and returns a constant upper
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp
index e31dcbd7498d1..619d9c7a9e7fc 100644
--- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp
@@ -965,7 +965,8 @@ FlatAffineValueConstraints::getLowerAndUpperBound(
 /// this process if needed.
 void FlatAffineValueConstraints::getSliceBounds(
     unsigned offset, unsigned num, MLIRContext *context,
-    SmallVectorImpl<AffineMap> *lbMaps, SmallVectorImpl<AffineMap> *ubMaps) {
+    SmallVectorImpl<AffineMap> *lbMaps, SmallVectorImpl<AffineMap> *ubMaps,
+    bool getClosedUB) {
   assert(num < getNumDimIds() && "invalid range");
 
   // Basic simplification.
@@ -1065,6 +1066,8 @@ void FlatAffineValueConstraints::getSliceBounds(
     // again.
   } while (changed);
 
+  int64_t ubAdjustment = getClosedUB ? 0 : 1;
+
   // Set the lower and upper bound maps for all the identifiers that were
   // computed as affine expressions of the rest as the "detected expr" and
   // "detected expr + 1" respectively; set the undetected ones to null.
@@ -1081,7 +1084,7 @@ void FlatAffineValueConstraints::getSliceBounds(
       if (expr) {
         lbMap = AffineMap::get(numMapDims, numMapSymbols, expr);
-        ubMap = AffineMap::get(numMapDims, numMapSymbols, expr + 1);
+        ubMap = AffineMap::get(numMapDims, numMapSymbols, expr + ubAdjustment);
       } else {
         // TODO: Whenever there are local identifiers in the dependence
         // constraints, we'll conservatively over-approximate, since we don't
@@ -1118,9 +1121,10 @@ void FlatAffineValueConstraints::getSliceBounds(
                    << "WARNING: Potentially over-approximating slice ub\n");
         auto ubConst = getConstantBound(BoundType::UB, pos + offset);
         if (ubConst.hasValue()) {
-          (ubMap) = AffineMap::get(
-              numMapDims, numMapSymbols,
-              getAffineConstantExpr(ubConst.getValue() + 1, context));
+          ubMap =
+              AffineMap::get(numMapDims, numMapSymbols,
+                             getAffineConstantExpr(
+                                 ubConst.getValue() + ubAdjustment, context));
         }
       }
     }
@@ -1158,10 +1162,13 @@ LogicalResult FlatAffineValueConstraints::flattenAlignedMapAndMergeLocals(
 }
 
 LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos,
-                                                   AffineMap boundMap) {
+                                                   AffineMap boundMap,
+                                                   bool isClosedBound) {
   assert(boundMap.getNumDims() == getNumDimIds() && "dim mismatch");
   assert(boundMap.getNumSymbols() == getNumSymbolIds() && "symbol mismatch");
   assert(pos < getNumDimAndSymbolIds() && "invalid position");
+  assert((type != BoundType::EQ || isClosedBound) &&
+         "EQ bound must be closed.");
 
   // Equality follows the logic of lower bound except that we add an equality
   // instead of an inequality.
@@ -1193,17 +1200,24 @@ LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos,
     for (unsigned i = boundMap.getNumInputs(); i < end; i++, j++) {
       ineq[j] = lower ? -flatExpr[i] : flatExpr[i];
     }
+    // Make the bound closed in if flatExpr is open. The inequality is always
+    // created in the upper bound form, so the adjustment is -1.
+    int64_t boundAdjustment = (isClosedBound || type == BoundType::EQ) ? 0 : -1;
     // Constant term.
-    ineq[getNumCols() - 1] =
-        lower ? -flatExpr[flatExpr.size() - 1]
-              // Upper bound in flattenedExpr is an exclusive one.
-              : flatExpr[flatExpr.size() - 1] - 1;
+    ineq[getNumCols() - 1] = (lower ? -flatExpr[flatExpr.size() - 1]
+                                    : flatExpr[flatExpr.size() - 1]) +
+                             boundAdjustment;
     type == BoundType::EQ ? addEquality(ineq) : addInequality(ineq);
   }
 
   return success();
 }
 
+LogicalResult FlatAffineValueConstraints::addBound(BoundType type, unsigned pos,
+                                                   AffineMap boundMap) {
+  return addBound(type, pos, boundMap, /*isClosedBound=*/type != BoundType::UB);
+}
+
 AffineMap FlatAffineValueConstraints::computeAlignedMap(AffineMap map,
                                                         ValueRange operands) const {
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index 87af3b9e26a24..bfd2c68cfa68a 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -177,7 +177,8 @@ SmallVector<Value> getDynOperands(Location loc, Value val, OpBuilder &b) {
 }
 
 void getUpperBoundForIndex(Value value, AffineMap &boundMap,
-                           SmallVectorImpl<Value> &boundOperands) {
+                           SmallVectorImpl<Value> &boundOperands,
+                           bool constantRequired) {
   // Initialize `boundMap` and `boundOperands` to the identity returning
   // `value`. This combination is the default result of the method if no
   // simplification is possible.
@@ -226,11 +227,13 @@ void getUpperBoundForIndex(Value value, AffineMap &boundMap,
     if (!(llvm::all_of(op->getResults(), findOrCreateId) &&
           llvm::all_of(op->getOperands(), findOrCreateId)))
       return;
+    // Add AffineApplyOps to the constraints.
     if (auto applyOp = dyn_cast<AffineApplyOp>(op)) {
-      AffineValueMap valueMap(applyOp.getAffineMap(), applyOp.getOperands(),
-                              applyOp.getResult());
-      if (failed(constraints.composeMap(&valueMap)))
+      AffineMap map = constraints.computeAlignedMap(applyOp.getAffineMap(),
+                                                    applyOp.getOperands());
+      if (failed(constraints.addBound(IntegerPolyhedron::EQ,
+                                      getPosition(applyOp.getResult()), map)))
         return;
       continue;
     }
@@ -239,17 +242,30 @@ void getUpperBoundForIndex(Value value, AffineMap &boundMap,
       AffineMap map = constraints.computeAlignedMap(minOp.getAffineMap(),
                                                     minOp.getOperands());
       if (failed(constraints.addBound(IntegerPolyhedron::UB,
-                                      getPosition(minOp.getResult()), map)))
+                                      getPosition(minOp.getResult()), map,
+                                      /*isClosedBound=*/true)))
         return;
   }
 
   // Obtain an upper bound for the affine index computation by projecting out
   // all temporary results and expressing the upper bound for `value` in terms
   // of the terminals of the index computation.
-  SmallVector<AffineMap> lowerBounds(1), upperBounds(1);
-  constraints.getSliceBounds(getPosition(value), 1, value.getContext(),
-                             &lowerBounds, &upperBounds);
+  unsigned pos = getPosition(value);
+  if (constantRequired) {
+    auto ubConst = constraints.getConstantBound(
+        FlatAffineValueConstraints::BoundType::UB, pos);
+    if (!ubConst.hasValue())
+      return;
+    boundMap =
+        AffineMap::getConstantMap(ubConst.getValue(), value.getContext());
+    return;
+  }
+
+  SmallVector<AffineMap> lowerBounds(1), upperBounds(1);
+  constraints.getSliceBounds(pos, 1, value.getContext(), &lowerBounds,
+                             &upperBounds,
+                             /*getClosedUB=*/true);
 
   // Verify `upperBounds[0]` is valid and has at least one result.
   if (!upperBounds[0] || upperBounds[0].getNumResults() == 0)
     return;
@@ -265,7 +281,8 @@ FailureOr<int64_t> getConstantUpperBoundForIndex(Value value) {
   // Compute an upper bound for `value`.
   AffineMap boundMap;
   SmallVector<Value> boundOperands;
-  getUpperBoundForIndex(value, boundMap, boundOperands);
+  getUpperBoundForIndex(value, boundMap, boundOperands,
+                        /*constantRequired=*/true);
 
   // Search the results of `boundMap` for constant upper bounds.
   SmallVector<int64_t> constantBounds;
diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
index e37f0fc06f2d1..0e0e2e1066d6e 100644
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -3,6 +3,7 @@
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,0.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL-MATMUL
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32 pack-paddings=1,1,0 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1 pack-paddings=1,1,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=PARTIAL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.depthwise_conv_2d_nhwc_hwc pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=1,2 pack-paddings=1,0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=DEPTHWISE_CONV_2D
 
 // MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 12, 7)>
 // MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
@@ -535,3 +536,65 @@ func.func @padding_the_output_dims_only(%arg0: tensor<24x12xf32>,
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
   func.return %5 : tensor<24x25xf32>
 }
+
+// -----
+
+// DEPTHWISE_CONV_2D-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (4, -s0 + 11)>
+// DEPTHWISE_CONV_2D-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2)>
+// DEPTHWISE_CONV_2D-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2 + 1)>
+// DEPTHWISE_CONV_2D-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * -2 + 8)>
+// DEPTHWISE_CONV_2D-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 4)>
+
+#map0 = affine_map<()[s0] -> (4, -s0 + 11)>
+#map1 = affine_map<()[s0] -> (s0 * 2)>
+#map2 = affine_map<()[s0] -> (s0 * 2 + 1)>
+
+// DEPTHWISE_CONV_2D: depthwise_conv_2d_padding
+// DEPTHWISE_CONV_2D-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x23x3x16xf32>
+// DEPTHWISE_CONV_2D-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<3x3x16xf32>
+// DEPTHWISE_CONV_2D-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<1x13x1x16xf32>
+// DEPTHWISE_CONV_2D-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+func.func @depthwise_conv_2d_padding(%arg0: tensor<1x23x3x16xf32>,
+                                     %arg1: tensor<3x3x16xf32>,
+                                     %arg2: tensor<1x13x1x16xf32>,
+                                     %iv0: index) -> tensor<1x?x1x16xf32> {
+  // DEPTHWISE_CONV_2D-DAG: %[[CST:.*]] = arith.constant 0.
+  // DEPTHWISE_CONV_2D-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // DEPTHWISE_CONV_2D-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[IV0]]]
+  %0 = affine.min #map0()[%iv0]
+  %1 = affine.apply #map1()[%iv0]
+  %2 = affine.apply #map2()[%0]
+
+  // DEPTHWISE_CONV_2D: %[[T3:.*]] = tensor.extract_slice %[[ARG0]]
+  // DEPTHWISE_CONV_2D: %[[T4:.*]] = tensor.extract_slice %[[ARG2]]
+  %3 = tensor.extract_slice %arg0[0, %1, 0, 0] [1, %2, 3, 16] [1, 1, 1, 1] : tensor<1x23x3x16xf32> to tensor<1x?x3x16xf32>
+  %4 = tensor.extract_slice %arg2[0, %iv0, 0, 0] [1, %0, 1, 16] [1, 1, 1, 1] : tensor<1x13x1x16xf32> to tensor<1x?x1x16xf32>
+
+  // Check the padding on the input.
+ // DEPTHWISE_CONV_2D: %[[T5:.*]] = affine.apply #[[MAP3]]()[%[[T0]]] + // DEPTHWISE_CONV_2D: %[[T6:.*]] = tensor.pad %[[T3]] + // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] + // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T5]], %[[C0]], %[[C0]]] + // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32 + + // Check the padding on the output. + // DEPTHWISE_CONV_2D: %[[T7:.*]] = affine.apply #[[MAP4]]()[%[[T0]]] + // DEPTHWISE_CONV_2D: %[[T8:.*]] = tensor.pad %[[T4]] + // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] + // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T7]], %[[C0]], %[[C0]]] + // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32 + + // DEPTHWISE_CONV_2D: %[[T9:.*]] = linalg.depthwise_conv_2d_nhwc_hwc + // DEPTHWISE_CONV_2D-SAME: ins(%[[T6]], %[[ARG1]] : tensor<1x9x3x16xf32>, tensor<3x3x16xf32>) + // DEPTHWISE_CONV_2D-SAME: outs(%[[T8]] : tensor<1x4x1x16xf32>) + %5 = linalg.depthwise_conv_2d_nhwc_hwc + {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} + ins(%3, %arg1 : tensor<1x?x3x16xf32>, tensor<3x3x16xf32>) + outs(%4 : tensor<1x?x1x16xf32>) -> tensor<1x?x1x16xf32> + + // Check the extract_slice to crop the padded output before return. + // DEPTHWISE_CONV_2D: %[[T10:.*]] = tensor.extract_slice %[[T9]][0, 0, 0, 0] + // DEPTHWISE_CONV_2D-SAME: [1, %[[T0]], 1, 16] + // DEPTHWISE_CONV_2D: return %[[T10]] : tensor<1x?x1x16xf32> + return %5 : tensor<1x?x1x16xf32> +}
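For reference, the static shapes checked by the new depthwise_conv_2d_padding test follow directly from the closed-bound computation described above. A short walk-through of the numbers, in the same style as the example in the commit message (an illustration, not part of the patch):

```
%0 = affine.min affine_map<()[s0] -> (4, -s0 + 11)>()[%iv0]   // closed upper bound: %0 <= 4
%2 = affine.apply affine_map<()[s0] -> (s0 * 2 + 1)>()[%0]    // closed upper bound: %2 <= 4 * 2 + 1 = 9

The input slice of shape [1, %2, 3, 16] is therefore padded up to the static
shape 1x9x3x16 (high pad %[[T5]] = 8 - 2 * %0, i.e. #[[MAP3]]), and the output
slice of shape [1, %0, 1, 16] up to 1x4x1x16 (high pad %[[T7]] = 4 - %0, i.e.
#[[MAP4]]), matching the DEPTHWISE_CONV_2D checks on the
linalg.depthwise_conv_2d_nhwc_hwc op. With the previous open-bound handling,
%0 would have been recorded as "%0 <= 3", under-sizing the padded input for
the stride-2 convolution.
```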