@@ -1 +1,36 @@
config.name = "NSan" + config.name_suffix

# Setup source root.
config.test_source_root = os.path.dirname(__file__)

# Test suffixes.
config.suffixes = [".c", ".cpp", ".test"]

# C flags.
c_flags = [config.target_cflags]

# CXX flags.
cxx_flags = c_flags + config.cxx_mode_flags + ["-std=c++17"]

nsan_flags = [
    "-fsanitize=numerical",
    "-g",
    "-mno-omit-leaf-frame-pointer",
    "-fno-omit-frame-pointer",
]


def build_invocation(compile_flags):
    return " " + " ".join([config.clang] + compile_flags) + " "


# Add substitutions.
config.substitutions.append(("%clang ", build_invocation(c_flags)))
config.substitutions.append(("%clang_nsan ", build_invocation(c_flags + nsan_flags)))
config.substitutions.append(
    ("%clangxx_nsan ", build_invocation(cxx_flags + nsan_flags))
)

# NSan tests are currently supported on Linux only.
if config.host_os not in ["Linux"]:
    config.unsupported = True
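A note on how lit consumes these substitutions: each entry is a literal string rewrite applied to RUN: lines before they execute. As a rough sketch (the exact flags depend on config.target_cflags; the file names are illustrative), a test line such as

  // RUN: %clang_nsan %s -o %t

expands to approximately

  clang <target_cflags> -fsanitize=numerical -g -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer <test>.c -o <tmp>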
@@ -0,0 +1,70 @@
// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s

// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=NaiveSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 not %run %t 2>&1 | FileCheck %s

// RUN: %clangxx_nsan -O0 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t

// RUN: %clangxx_nsan -O3 -mllvm -nsan-shadow-type-mapping=dqq -g -DSUM=KahanSum -DFLT=float %s -o %t
// RUN: NSAN_OPTIONS=halt_on_error=1,log2_max_relative_error=19 %run %t
#include <chrono>
#include <cstdint> // uint32_t
#include <cstdio>  // printf
#include <iostream>
#include <random>
#include <vector>
// A naive, numerically unstable summation.
template <typename T>
__attribute__((noinline)) // To check call stack reporting.
T NaiveSum(const std::vector<T> &values) {
  T sum = 0;
  for (T v : values) {
    sum += v;
  }
  return sum;
  // CHECK: WARNING: NumericalStabilitySanitizer: inconsistent shadow results while checking return
  // CHECK: float{{ *}}precision (native):
  // CHECK: double{{ *}}precision (shadow):
  // CHECK: {{#0 .*in .* NaiveSum}}
}

// Kahan summation is numerically stable.
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm
template <typename T>
__attribute__((noinline)) T KahanSum(const std::vector<T> &values) {
  T sum = 0;
  T c = 0; // Running compensation for lost low-order bits.
  for (T v : values) {
    T y = v - c;
    T t = sum + y;
    c = (t - sum) - y;
    sum = t;
  }
  return sum;
}

int main() {
  std::vector<FLT> values;
  constexpr int kNumValues = 1000000;
  values.reserve(kNumValues);
  // Use a fixed seed to avoid flakiness.
  constexpr uint32_t kSeed = 0x123456;
  std::mt19937 gen(kSeed);
  std::uniform_real_distribution<FLT> dis(0.0f, 1000.0f);
  for (int i = 0; i < kNumValues; ++i) {
    values.push_back(dis(gen));
  }

  const auto t1 = std::chrono::high_resolution_clock::now();
  const auto sum = SUM(values);
  const auto t2 = std::chrono::high_resolution_clock::now();
  printf("sum: %.8f\n", sum);
  std::cout << "runtime: "
            << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1)
                       .count() /
                   1000.0
            << "ms\n";
  return 0;
}
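Why the naive version trips nsan: with a million summands near 1000, the running float total reaches roughly 5e8, where a float ULP is 32, so each addition discards most low-order bits of the summand, while the double-precision shadow value keeps them. A minimal standalone illustration of that absorption (independent of this test):

  #include <cstdio>

  int main() {
    float big = 5.0e8f;      // Magnitude the running sum reaches; exactly representable.
    float small = 1000.0f;   // Magnitude of a single summand.
    float sum = big + small; // The ULP at 5e8f is 32, so low bits of `small` are lost.
    std::printf("%.1f\n", sum - big); // Prints 992.0, not 1000.0.
  }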
@@ -0,0 +1,115 @@
//===- CombinerHelperCasts.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements CombinerHelper for G_ANYEXT, G_SEXT, G_TRUNC, and
// G_ZEXT.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/Casting.h"

#define DEBUG_TYPE "gi-combiner"

using namespace llvm;

bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
                                      BuildFnTy &MatchInfo) {
  GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Sext->getSrcReg(), MRI));

  Register Dst = Sext->getReg(0);
  Register Src = Trunc->getSrcReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (DstTy == SrcTy) {
    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
    return true;
  }

  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
    MatchInfo = [=](MachineIRBuilder &B) {
      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoSWrap);
    };
    return true;
  }

  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
      isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}})) {
    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
    return true;
  }

  return false;
}

bool CombinerHelper::matchZextOfTrunc(const MachineOperand &MO,
                                      BuildFnTy &MatchInfo) {
  GZext *Zext = cast<GZext>(getDefIgnoringCopies(MO.getReg(), MRI));
  GTrunc *Trunc = cast<GTrunc>(getDefIgnoringCopies(Zext->getSrcReg(), MRI));

  Register Dst = Zext->getReg(0);
  Register Src = Trunc->getSrcReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (DstTy == SrcTy) {
    MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, Src); };
    return true;
  }

  if (DstTy.getScalarSizeInBits() < SrcTy.getScalarSizeInBits() &&
      isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {DstTy, SrcTy}})) {
    MatchInfo = [=](MachineIRBuilder &B) {
      B.buildTrunc(Dst, Src, MachineInstr::MIFlag::NoUWrap);
    };
    return true;
  }

  if (DstTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits() &&
      isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {DstTy, SrcTy}})) {
    MatchInfo = [=](MachineIRBuilder &B) {
      B.buildZExt(Dst, Src, MachineInstr::MIFlag::NonNeg);
    };
    return true;
  }

  return false;
}

bool CombinerHelper::matchNonNegZext(const MachineOperand &MO,
                                     BuildFnTy &MatchInfo) {
  GZext *Zext = cast<GZext>(MRI.getVRegDef(MO.getReg()));

  Register Dst = Zext->getReg(0);
  Register Src = Zext->getSrcReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  const auto &TLI = getTargetLowering();

  // Convert zext nneg to sext if sext is the preferred form for the target.
  if (isLegalOrBeforeLegalizer({TargetOpcode::G_SEXT, {DstTy, SrcTy}}) &&
      TLI.isSExtCheaperThanZExt(getMVTForLLT(SrcTy), getMVTForLLT(DstTy))) {
    MatchInfo = [=](MachineIRBuilder &B) { B.buildSExt(Dst, Src); };
    return true;
  }

  return false;
}
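In MIR terms, the three cases of matchSextOfTrunc rewrite a sign-extend of a truncate according to the relative widths of the extend's destination and the truncate's original source; a sketch with illustrative types and vreg names (matchZextOfTrunc is the analogous pattern with nuw/G_ZEXT):

  ; DstTy == SrcTy: the pair cancels to a copy.
  %t:_(s32) = G_TRUNC %x:_(s64)
  %d:_(s64) = G_SEXT %t:_(s32)   ; combines to: %d:_(s64) = COPY %x:_(s64)

  ; DstTy narrower than SrcTy: fold to a single truncate, tagged nsw.
  %t:_(s16) = G_TRUNC %x:_(s64)
  %d:_(s32) = G_SEXT %t:_(s16)   ; combines to: %d:_(s32) = nsw G_TRUNC %x:_(s64)

  ; DstTy wider than SrcTy: sign-extend straight from the original source.
  %t:_(s16) = G_TRUNC %x:_(s32)
  %d:_(s64) = G_SEXT %t:_(s16)   ; combines to: %d:_(s64) = G_SEXT %x:_(s32)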
@@ -0,0 +1,83 @@
# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
# RUN:   -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN:   | FileCheck %s
# REQUIRES: asserts

# Check that window scheduling is not performed when the initiation
# interval (II) has already been set via pragma.

# CHECK: Window scheduling is disabled when llvm.loop.pipeline.initiationinterval is set.
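For reference, the !llvm.loop.pipeline.initiationinterval metadata attached to the loop below is what clang emits for its loop-pipelining pragma; a source-level sketch of the kind of input that produces it (function name reused from the test, loop body hypothetical):

  void test_pragma_ii_fail(int *a, int n) {
  // Pin the software pipeliner's initiation interval to 2; the window
  // scheduler is expected to back off when this metadata is present.
  #pragma clang loop pipeline_initiation_interval(2)
    for (int i = 1; i < n; ++i)
      a[i] += a[i - 1];
  }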
--- |
  define void @test_pragma_ii_fail(ptr %a0, i32 %a1) {
  b0:
    %v0 = icmp sgt i32 %a1, 1
    br i1 %v0, label %b1, label %b4

  b1:                                               ; preds = %b0
    %v1 = load i32, ptr %a0, align 4
    %v2 = add i32 %v1, 10
    %v4 = add i32 %a1, -1
    %cgep = getelementptr i32, ptr %a0, i32 1
    br label %b2

  b2:                                               ; preds = %b2, %b1
    %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ]
    %v6 = phi ptr [ %cgep2, %b2 ], [ %cgep, %b1 ]
    %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ]
    store i32 %v7, ptr %v6, align 4
    %v8 = add i32 %v7, 10
    %cgep1 = getelementptr i32, ptr %v6, i32 -1
    store i32 %v8, ptr %cgep1, align 4
    %v10 = add i32 %v7, 10
    %v12 = add i32 %v5, -1
    %v13 = icmp eq i32 %v12, 0
    %cgep2 = getelementptr i32, ptr %v6, i32 1
    br i1 %v13, label %b4, label %b2, !llvm.loop !0

  b4:                                               ; preds = %b2, %b0
    ret void
  }

  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 2}
...
---
name: test_pragma_ii_fail
tracksRegLiveness: true
body: |
  bb.0.b0:
    successors: %bb.1(0x40000000), %bb.3(0x40000000)
    liveins: $r0, $r1

    %0:intregs = COPY $r1
    %1:intregs = COPY $r0
    %2:predregs = C2_cmpgti %0, 1
    J2_jumpf %2, %bb.3, implicit-def dead $pc
    J2_jump %bb.1, implicit-def dead $pc

  bb.1.b1:
    successors: %bb.2(0x80000000)

    %3:intregs, %4:intregs = L2_loadri_pi %1, 4
    %5:intregs = A2_addi killed %3, 10
    %6:intregs = A2_addi %0, -1
    %7:intregs = COPY %6
    J2_loop0r %bb.2, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr

  bb.2.b2 (machine-block-address-taken):
    successors: %bb.3(0x04000000), %bb.2(0x7c000000)

    %8:intregs = PHI %4, %bb.1, %9, %bb.2
    %10:intregs = PHI %5, %bb.1, %11, %bb.2
    S2_storeri_io %8, 0, %10
    %11:intregs = A2_addi %10, 10
    S2_storeri_io %8, -4, %11
    %9:intregs = A2_addi %8, 4
    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
    J2_jump %bb.3, implicit-def dead $pc

  bb.3.b4:
    PS_jmpret $r31, implicit-def dead $pc
...
@@ -0,0 +1,45 @@
# REQUIRES: asserts
# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
# RUN:   -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN:   | FileCheck %s

# CHECK-NOT: Can't find a valid II. Keep searching...
# CHECK: Start analyzing II
# CHECK: Start scheduling Phis
# CHECK: Current window Offset is {{[0-9]+}} and II is {{[0-9]+}}

---
name: relu
tracksRegLiveness: true
body: |
  bb.0:
    successors: %bb.2(0x30000000), %bb.1(0x50000000)
    liveins: $r0, $r1, $r2

    %0:intregs = COPY $r2
    %1:intregs = COPY $r1
    %2:intregs = COPY $r0
    %3:predregs = C2_cmpeqi %2, 0
    J2_jumpt killed %3, %bb.2, implicit-def dead $pc
    J2_jump %bb.1, implicit-def dead $pc

  bb.1:
    successors: %bb.3(0x80000000)

    %4:hvxvr = V6_vd0
    %5:intregs = A2_addi %2, 31
    %6:intregs = S2_lsr_i_r %5, 5
    %7:intregs = COPY %6
    J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
    J2_jump %bb.3, implicit-def dead $pc

  bb.2:
    PS_jmpret $r31, implicit-def dead $pc

  bb.3 (machine-block-address-taken):
    successors: %bb.3(0x7c000000), %bb.2(0x04000000)

    %8:intregs = PHI %1, %bb.1, %9, %bb.3
    %10:intregs = PHI %0, %bb.1, %14, %bb.3
    %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
    %12:intregs = COPY %10
    %13:hvxvr = V6_vmaxw killed %11, %4
    %14:intregs = V6_vS32b_pi %12, 128, killed %13
    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
    J2_jump %bb.2, implicit-def dead $pc
...
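A note on both window-scheduler tests: the REQUIRES: asserts line is needed because the strings matched by FileCheck come from the -debug-only=pipeliner stream, and LLVM_DEBUG output is compiled out of release (no-asserts) builds.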
@@ -0,0 +1,139 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define float @remquo_f32(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret float 1.000000e+00
;
entry:
  %call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_quo_sign(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_quo_sign(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 2, ptr [[QUO]], align 4
; CHECK-NEXT: ret float -1.000000e+00
;
entry:
  %call = call float @remquof(float 5.000000e+00, float 3.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_round(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_round(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -6, ptr [[QUO]], align 4
; CHECK-NEXT: ret float 0xBFC9999900000000
;
entry:
  %call = call float @remquof(float -5.000000e+00, float 0x3FE99999A0000000, ptr %quo)
  ret float %call
}

define double @remquo_f64(ptr %quo) {
; CHECK-LABEL: define double @remquo_f64(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -5, ptr [[QUO]], align 4
; CHECK-NEXT: ret double -0.000000e+00
;
entry:
  %call = call double @remquo(double -5.000000e+00, double 1.000000e+00, ptr %quo)
  ret double %call
}

; Negative tests

define float @remquo_f32_inf_x(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_inf_x(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float 0x7FF0000000000000, float 1.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_zero_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_zero_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_nzero_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nzero_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float -5.000000e+00, float -0.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_nan_x(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nan_x(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float 0x7FF8000000000000, float 1.000000e+00, ptr %quo)
  ret float %call
}

define float @remquo_f32_nan_y(ptr %quo) {
; CHECK-LABEL: define float @remquo_f32_nan_y(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr [[QUO]])
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float 1.000000e+00, float 0x7FF8000000000000, ptr %quo)
  ret float %call
}

define float @remquo_f32_strictfp(ptr %quo) strictfp {
; CHECK-LABEL: define float @remquo_f32_strictfp(
; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr [[QUO]]) #[[ATTR0]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float -5.000000e+00, float 3.000000e+00, ptr %quo) strictfp
  ret float %call
}

define float @remquo_f32_zero_y_strictfp(ptr %quo) strictfp {
; CHECK-LABEL: define float @remquo_f32_zero_y_strictfp(
; CHECK-SAME: ptr [[QUO:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[CALL:%.*]] = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr [[QUO]]) #[[ATTR0]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
  %call = call float @remquof(float -5.000000e+00, float 0.000000e+00, ptr %quo) strictfp
  ret float %call
}

declare float @remquof(float, float, ptr)
declare double @remquo(double, double, ptr)
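For readers unfamiliar with the libcall being folded: remquo computes the IEEE remainder x - n*y, where n is x/y rounded to the nearest integer, and stores at least the low-order bits of n, with its sign, through the pointer. The negative tests above cover inputs the folder must leave alone: inf/NaN operands and zero divisors (NaN results, possible domain errors), and strictfp call sites, where the call may raise FP exceptions. A plain-C sketch mirroring the first test:

  #include <math.h>
  #include <stdio.h>

  int main(void) {
    int quo;
    // -5/3 = -1.66..., which rounds to n = -2, so the remainder is
    // -5 - (-2)*3 = 1.0 and quo receives (the low bits of) -2.
    float r = remquof(-5.0f, 3.0f, &quo);
    printf("r = %g, quo = %d\n", r, quo); // r = 1, quo = -2
    return 0;
  }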
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define fp128 @remquo_fp128(ptr %quo) {
; CHECK-LABEL: define fp128 @remquo_fp128(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret fp128 0xL00000000000000003FFF000000000000
;
entry:
  %call = call fp128 @remquol(fp128 0xL0000000000000000C001400000000000, fp128 0xL00000000000000004000800000000000, ptr %quo)
  ret fp128 %call
}

declare fp128 @remquol(fp128, fp128, ptr)
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define x86_fp80 @remquo_fp80(ptr %quo) {
; CHECK-LABEL: define x86_fp80 @remquo_fp80(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret x86_fp80 0xK3FFF8000000000000000
;
entry:
  %call = call x86_fp80 @remquol(x86_fp80 0xKC001A000000000000000, x86_fp80 0xK4000C000000000000000, ptr %quo)
  ret x86_fp80 %call
}

declare x86_fp80 @remquol(x86_fp80, x86_fp80, ptr)
@@ -0,0 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

define ppc_fp128 @remquo_ppc_fp128(ptr %quo) {
; CHECK-LABEL: define ppc_fp128 @remquo_ppc_fp128(
; CHECK-SAME: ptr [[QUO:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 -2, ptr [[QUO]], align 4
; CHECK-NEXT: ret ppc_fp128 0xM3FF00000000000000000000000000000
;
entry:
  %call = call ppc_fp128 @remquol(ppc_fp128 0xMC0140000000000000000000000000000, ppc_fp128 0xM40080000000000000000000000000000, ptr %quo)
  ret ppc_fp128 %call
}

declare ppc_fp128 @remquol(ppc_fp128, ppc_fp128, ptr)
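Decoding the constants in these three tests: LLVM spells hexadecimal floating-point literals for non-double types with a type prefix (0xK for x86_fp80, 0xL for fp128, 0xM for ppc_fp128). In each format the operands encode -5.0 and 3.0, so all three fold to remainder 1.0 with -2 stored through %quo, exercising the same constant-folding path at extended precisions.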
@@ -0,0 +1,175 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_1d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-F32

// REDEFINE: %{entry_point} = reduce_1d_i32
// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-I32

// REDEFINE: %{entry_point} = generic_reduce_1d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-F32

func.func @reduce_1d_f32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
  %C_alloc = bufferization.alloc_tensor() : tensor<f32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }

  // Print and verify the output
  // REDUCE-F32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // REDUCE-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // REDUCE-F32-NEXT: [3141.6]

  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // REDUCE-F32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

func.func @reduce_1d_i32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_i32 = arith.constant 0 : i32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xi32>
  %C_alloc = bufferization.alloc_tensor() : tensor<i32>

  // Initialise the tensors
  %pi = arith.constant 3 : i32
  %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?xi32>) -> tensor<?xi32>
  %C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor<i32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?xi32>) outs(%C_in: tensor<i32>) dimensions = [0]
    (%in: i32, %init: i32) {
      %0 = arith.addi %in, %init : i32
      linalg.yield %0 : i32
    }

  // Print and verify the output
  // REDUCE-I32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // REDUCE-I32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // REDUCE-I32-NEXT: [3000]

  %xf = tensor.cast %C_out : tensor<i32> to tensor<*xi32>
  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()

  // REDUCE-I32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

func.func @generic_reduce_1d_f32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
  %C_alloc = bufferization.alloc_tensor() : tensor<f32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

  // Reduce
  %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                             affine_map<(d0) -> ()>],
                            iterator_types = ["reduction"] }
    ins(%A_in : tensor<?xf32>)
    outs(%C_in : tensor<f32>) {
  ^bb(%in: f32, %out: f32) :
    %0 = arith.addf %in, %out : f32
    linalg.yield %0 : f32
  } -> tensor<f32>

  // Print and verify the output
  // GENERIC-F32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // GENERIC-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // GENERIC-F32-NEXT: [3141.6]

  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // GENERIC-F32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">

    transform.yield
  }
  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
    ^bb2(%func : !transform.op<"func.func">):
      transform.include @tile_and_vectorize_reduce failures(propagate)
        (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
func.func private @printMemrefI32(%ptr : tensor<*xi32>)
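A note on the bracketed sizes in Steps 1 and 2 above (and in the 2-D variant that follows): in the transform dialect, the inner brackets in tile_sizes [[4]] and vector_sizes [[4]] mark that size as scalable, i.e. 4 x vscale, so the generated loop and masked vector operations track SVE's runtime vector length instead of a fixed width.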
@@ -0,0 +1,180 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_2d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE

// REDEFINE: %{entry_point} = generic_reduce_2d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC

// REDEFINE: %{entry_point} = generic_reduce_2d_i32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-I32
func.func @reduce_2d_f32() {
  // 2-D Tensor
  %M = arith.constant 16 : index
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?x?xf32>) outs(%C_in: tensor<?xf32>) dimensions = [1]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }

  // Print and verify the output
  // REDUCE-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
  // REDUCE-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // REDUCE-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

func.func @generic_reduce_2d_f32() {
  // 2-D Tensor
  %M = arith.constant 16 : index
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xf32>
  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xf32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?x?xf32>) -> tensor<?x?xf32>
  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor<?xf32>) -> tensor<?xf32>

  // Reduce
  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                             affine_map<(d0, d1) -> (d0)>],
                            iterator_types = ["parallel", "reduction"] }
    ins(%A_in : tensor<?x?xf32>)
    outs(%C_in : tensor<?xf32>) {
  ^bb(%in: f32, %out: f32) :
    %0 = arith.addf %in, %out : f32
    linalg.yield %0 : f32
  } -> tensor<?xf32>

  // Print and verify the output
  // GENERIC-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
  // GENERIC-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6]

  %xf = tensor.cast %C_out : tensor<?xf32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // GENERIC-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

func.func @generic_reduce_2d_i32() {
  // 2-D Tensor
  %M = arith.constant 16 : index
  %N = arith.constant 1000 : index
  %c0_i32 = arith.constant 0 : i32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor<?x?xi32>
  %C_alloc = bufferization.alloc_tensor(%M) : tensor<?xi32>

  // Initialise the tensors
  %pi = arith.constant 3 : i32
  %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor<?x?xi32>) -> tensor<?x?xi32>
  %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor<?xi32>) -> tensor<?xi32>

  // Reduce
  %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                                             affine_map<(d0, d1) -> (d0)>],
                            iterator_types = ["parallel", "reduction"] }
    ins(%A_in : tensor<?x?xi32>)
    outs(%C_in : tensor<?xi32>) {
  ^bb(%in: i32, %out: i32) :
    %0 = arith.addi %in, %out : i32
    linalg.yield %0 : i32
  } -> tensor<?xi32>

  // Print and verify the output
  // GENERIC-I32-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // GENERIC-I32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data =
  // GENERIC-I32-NEXT: [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000]

  %xf = tensor.cast %C_out : tensor<?xi32> to tensor<*xi32>
  call @printMemrefI32(%xf) : (tensor<*xi32>) -> ()

  // GENERIC-I32-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}
module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">

    transform.yield
  }

  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
    ^bb2(%func : !transform.op<"func.func">):
      transform.include @tile_and_vectorize_reduce failures(propagate)
        (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)
func.func private @printMemrefI32(%ptr : tensor<*xi32>)