Squashed commit of the following:

commit 11e8fdae41e596d6102e46c37a22a26c94d7fe85
Author: Mason Remy <masonr@microsoft.com>
Date: Thu Mar 2 05:53:10 2023 +0000

Merged PR 3131: Set masked load/store inbounds flag to true

Set masked load/store inbounds flag to true

The mask we generate, as well as the rest of our infrastructure, will prevent out-of-bounds accesses when used properly. Therefore for performance reasons we don't want MLIR to generate runtime bounds checking

commit 14a04925721ed575befc65e93e4670e27e4d1063
Author: Mason Remy <masonr@microsoft.com>
Date: Thu Mar 2 00:28:38 2023 +0000

Merged PR 3130: Recognize and simplify always true EQ and NE CmpOps

Recognize and simplify always true EQ and NE CmpOps

These would already get simplified after converting to the builtin dialects, but this makes them happen earlier in the lowering

commit 91b76428c61a52d454ac5ae8fa6485edd9bdfbe5
Author: Mason Remy <masonr@microsoft.com>
Date: Wed Mar 1 23:46:29 2023 +0000

Merged PR 3129: Optimize 1-row horizontal i16->i32 sum reduction

Optimize 1-row horizontal i16->i32 sum reduction

commit be987bcf641c09dd43d959cc7e8a1b37d33ba591
Author: JUBI TANEJA <jubitaneja@microsoft.com>
Date: Wed Mar 1 19:59:34 2023 +0000

Merged PR 3118: vectorize accumulation of results of two masked load ops

This PR vectorizes a pattern that occurs in MMIF where there are two conditional loads, followed by an accumulation operation, and a conditional store.

On vectorizing the following DSL:
```
N_input = 8
N_output = 5
Input = Array(role=Role.INPUT, element_type=ScalarType.int32, shape=(N_input, ))
Output = Array(role=Role.INPUT_OUTPUT, element_type=ScalarType.int32, shape=(N_output, ))
nest = Nest(shape=(N_input, ))
i, = nest.get_indices()

@nest.iteration_logic
def _nest():
    def store_value():
        Output[i] += Input[i]
    _If(i < N_output, store_value)
```

It produces the following assembly. We are looking for `vpmaskmovd` instructions that correspond to vector.transfer_read/vector.transfer_write ops in MLIR.
```
0000000000000030 <test_vectorized_masked_accumulate_3e5de44f3dcca64e>:
  30: c5 fd 6f 05 00 00 00    vmovdqa 0x0(%rip),%ymm0 # 38 <test_vectorized_masked_accumulate_3e5de44f3dcca64e+0x8>
  37: 00
  38: c4 e2 7d 8c 0e          vpmaskmovd (%rsi),%ymm0,%ymm1
  3d: c4 e2 7d 8c 17          vpmaskmovd (%rdi),%ymm0,%ymm2
  42: c5 ed fe c9             vpaddd %ymm1,%ymm2,%ymm1
  46: c4 e2 7d 8e 0e          vpmaskmovd %ymm1,%ymm0,(%rsi)
  4b: c5 f8 77                vzeroupper
  4e: c3                      retq
```

commit 69b87522136cae60b0f5b4d62919a2ebd5577933
Author: Kern Handa <kerha@microsoft.com>
Date: Wed Mar 1 17:47:14 2023 +0000

Merged PR 3126: [test] Adds more tests for vectorized transpose

[test] Adds more tests for vectorized transpose

commit c4d81701faf3351218cd69726c487f642e4bfca0
Author: Mason Remy <masonr@microsoft.com>
Date: Wed Mar 1 06:48:35 2023 +0000

Merged PR 3121: [nfc] Separate bounds checking into separate pass file

[nfc] Separate bounds checking into separate pass file

This removes the bounds checking code from ExecutionPlanToAffineLoweringPass and creates a separate pass file for it. There is no change in when and where the checking occurs (currently it only happens for caching-generated loads and stores).

In a future change we will further separate the pass and run it at a different phase of the lowering and plumb controls for enabling/disabling it to the DSL

commit b221544937f8776d48a8f9daddf601378534705b
Author: Mason Remy <masonr@microsoft.com>
Date: Wed Mar 1 01:18:59 2023 +0000

Merged PR 3122: Fix reinterpret_cast output memref shape

Fix reinterpret_cast output memref shape

commit eb3582ba07cb4118f73bb630589f07de27ba9c50
Author: Mason Remy <masonr@microsoft.com>
Date: Fri Feb 24 23:51:30 2023 +0000

Merged PR 3115: Normalize AffineForOps to have unit stride and begin at 0

Normalize AffineForOps to have unit stride and begin at 0

commit 3ec2bd7f5353a4119294095eb5084a1e7a298051
Author: Mason Remy <masonr@microsoft.com>
Date: Fri Feb 24 22:26:13 2023 +0000

Merged PR 3117: Vectorize horizontal multi-dim sum reductions

Vectorize horizontal multi-dim sum reductions

Recognizes and vectorizes these sum reductions:
4x16xi16 -> 4x1xi32
4x8xi32 -> 4x1xi32
4x8xf32 -> 4x1xf32

commit 6f46df5ba99eeb237dcbbdda28a0975964af1186
Author: Kern Handa <kerha@microsoft.com>
Date: Fri Feb 24 11:13:45 2023 +0000

Merged PR 3099: Adds pattern rewriting for AVX2 vectorized transpose
microsoft · Mar 2, 2023 · 05f8c0d · 05f8c0d
1 parent 604e745
commit 05f8c0d
Show file tree

Hide file tree

Showing 22 changed files with 2,745 additions and 724 deletions.
diff --git a/accera/acc-opt/test/thrifty_caching.mlir b/accera/acc-opt/test/thrifty_caching.mlir
@@ -69,8 +69,8 @@ module @test_thrifty_caching_simple_input_cache attributes {llvm.data_layout = "
 // CHECK:             affine.for %arg6 = 0 to 16 {
 // CHECK:               %1 = affine.load %arg1[%arg5, %arg4 + %arg6] : memref<32x32xf32, #map0>
 // CHECK:               affine.store %1, %0[%arg5, %arg6] : memref<32x16xf32, 3>
-// CHECK:             } {accxp.access_bounds_check, beginMap = #map1, endMap = #map2, index = #accln<"index{j,7}">, kernels = ["cache_internal_loopnest_kernel_active_block_copy"], operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{j,7}">, subdomainIndexOrder = [#accln<"index{i,6}">, #accln<"index{j,7}">], subdomainSize = [32, 16]}
-// CHECK:           } {accxp.access_bounds_check, beginMap = #map1, endMap = #map3, index = #accln<"index{i,6}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{i,6}">, subdomainIndexOrder = [#accln<"index{i,6}">, #accln<"index{j,7}">], subdomainSize = [32, 16]}
+// CHECK:             } {accaffine.access_bounds_check, beginMap = #map1, endMap = #map2, index = #accln<"index{j,7}">, kernels = ["cache_internal_loopnest_kernel_active_block_copy"], operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{j,7}">, subdomainIndexOrder = [#accln<"index{i,6}">, #accln<"index{j,7}">], subdomainSize = [32, 16]}
+// CHECK:           } {accaffine.access_bounds_check, beginMap = #map1, endMap = #map3, index = #accln<"index{i,6}">, operand_segment_sizes = dense<[0, 0, 1]> : vector<3xi32>, scheduledIndex = #accln<"index{i,6}">, subdomainIndexOrder = [#accln<"index{i,6}">, #accln<"index{j,7}">], subdomainSize = [32, 16]}
 // CHECK:           affine.for %arg5 = 0 to 4 {
 // CHECK:             affine.for %arg6 = 0 to 16 {
 // CHECK:               affine.for %arg7 = 0 to 32 {

diff --git a/accera/acc-opt/test/vectorization.mlir b/accera/acc-opt/test/vectorization.mlir
diff --git a/accera/ir/include/IRUtil.h b/accera/ir/include/IRUtil.h
@@ -463,5 +463,7 @@ namespace util
 
     std::vector<mlir::Value> GetDynamicOffsetSymbols(mlir::Value val);
 
+    bool AncestorOpContainsAttrOfName(mlir::Operation* op, const mlir::StringRef& name);
+
 } // namespace util
 } // namespace accera::ir
diff --git a/accera/ir/include/exec/ExecutionPlanOps.h b/accera/ir/include/exec/ExecutionPlanOps.h
@@ -100,9 +100,6 @@ namespace executionPlan
 namespace accera::ir::executionPlan
 {
 
-// Unit attr name for controlling whether bounds checking is done for ops within a marked op
-const mlir::StringRef AccessBoundsCheckAttrName = "accxp.access_bounds_check";
-
 //
 // Utility functions and EDSC-type intrinsics
 //

diff --git a/accera/ir/include/value/ValueOps.td b/accera/ir/include/value/ValueOps.td
@@ -1577,5 +1577,42 @@ def accv_vminps : accv_Op<"vminps", [NoSideEffect]>{
   let results = (outs AnyVector:$result);
 }
 
+def accv_vhadd : accv_Op<"vhadd", [NoSideEffect, SameOperandsAndResultShape, SameOperandsAndResultType]>{
+  let summary = "Vector horizontal interleaved add operation";
+
+  let description = [{
+    The `accv.vhadd` operation lowers to differently sized vector instructions depending on the element type in the vector operands.
+
+    For 32-bit operands the interleaving and adding follows the pattern:
+      vhadd( A[0...7], B[0...7] ) ->
+        [ A[0]+A[1],
+          A[2]+A[3],
+          B[0]+B[1],
+          B[2]+B[3],
+          A[4]+A[5],
+          A[6]+A[7],
+          B[4]+B[5],
+          B[6]+B[7] ]
+
+    For different bit-width operands, the corresponding byte positions are kept consistent as with the 32-bit operands.
+    i.e. for i16, the first 4 elements are the pairwise sums of the first 8 elements of A, as opposed to 2 and 4 with i32's
+         for f64, the first 1 element is the sum of the first 2 elements of A, as opposed to 2 and 4 with f32's
+
+    Supported operand / result types and their corresponding AVX instructions:
+    Operand / Result Type | Instruction
+    ----------------------|-------------
+       vector<8xi32>      |   vphaddd
+       vector<16xi16>     |   vphaddw
+       vector<8xf32>      |   vhaddps
+       vector<4xf64>      |   vhaddpd
+
+    Note: this lowers to MLIR vector dialect ops, so a particular target architecture is not required,
+          and instructions other than those listed above are possible on other architectures
+  }];
+
+  let arguments = (ins AnyVector:$lhs, AnyVector:$rhs);
+  let results = (outs AnyVector:$result);
+}
+
 
 #endif // ACCERA_accv_OPS
diff --git a/accera/ir/src/IRUtil.cpp b/accera/ir/src/IRUtil.cpp
@@ -1540,5 +1540,18 @@ namespace util
         return offsetSymbols;
     }
 
+    bool AncestorOpContainsAttrOfName(mlir::Operation* op, const mlir::StringRef& name)
+    {
+        while (op != nullptr)
+        {
+            if (op->getAttr(name) != nullptr)
+            {
+                return true;
+            }
+            op = op->getParentOp();
+        }
+        return false;
+    }
+
 } // namespace util
 } // namespace accera::ir
diff --git a/accera/python/accera/test/dsl_tests.py b/accera/python/accera/test/dsl_tests.py
@@ -65,7 +65,7 @@ def _get_test_mode(correctness_check: bool = False):
 
 class DSLTest_01Arrays(unittest.TestCase):
 
-    def _verify_nest(self, nest, args: Tuple[Array], package_name, correctness_check_values=None) -> None:
+    def _verify_nest(self, nest, args: Tuple[Array], package_name, correctness_check_values=None, quiet=True) -> None:
 
         # create a HAT package and add the function to it
         package = Package()
@@ -74,7 +74,7 @@ def _verify_nest(self, nest, args: Tuple[Array], package_name, correctness_check
 
         # build the HAT package
         with verifiers.VerifyPackage(self, package_name, output_dir) as v:
-            package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir)
+            package.build(package_name, format=TEST_FORMAT, mode=_get_test_mode(correctness_check_values), output_dir=output_dir, _quiet=quiet)
             if correctness_check_values:
                 v.check_correctness(
                     function.name,
@@ -667,13 +667,13 @@ def main(array):
         package.add(main, args=(arr, ))
 
         package_name = "test_reinterpret_cast"
-
-        with verifiers.VerifyPackage(self, package_name, TEST_PACKAGE_DIR):
+        output_dir = pathlib.Path(TEST_PACKAGE_DIR) / package_name
+        with verifiers.VerifyPackage(self, package_name, output_dir):
             package.build(
                 package_name,
                 format=TEST_FORMAT,
                 mode=Package.Mode.RELEASE,
-                output_dir=TEST_PACKAGE_DIR,
+                output_dir=output_dir,
                 _quiet=False
             )
 
@@ -6131,6 +6131,7 @@ def _():
                 function.name,
                 before=[A_test, B_test, C_test],
                 after=[A_test, B_test, (C_test + A_test) * B_test - 1.0],
+                tolerance=1e-4
             )
 
     def test_debug_mode_fusion_cascading_2(self) -> None: