[CPU][SVE] Failed to legalize multi-dim constant #16784

Open
dcaballe opened this issue Mar 14, 2024 · 0 comments
Labels: codegen (Shared code generation infrastructure and dialects), codegen/llvm (LLVM code generation compiler backend)

Error:

<unknown>:0: error: failed to legalize operation 'arith.constant' that was explicitly marked illegal
<unknown>:0: note: see current operation: %2 = "arith.constant"() <{value = dense<0.000000e+00> : vector<[16]x8xf32>}> : () -> vector<[16]x8xf32>
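For reference, the op that fails to legalize is a splat-zero constant of a 2-D vector type whose leading dimension is scalable. A minimal standalone sketch of just that pattern (pulled out of the note above for illustration; the function name is made up and this is not a verified reproducer):

func.func @minimal_repro() -> vector<[16]x8xf32> {
  // 2-D splat constant with a scalable leading dimension ([16]);
  // this is the op the conversion reports as explicitly illegal.
  %cst = arith.constant dense<0.000000e+00> : vector<[16]x8xf32>
  return %cst : vector<[16]x8xf32>
}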

Repro:

iree-compile --iree-hal-target-backends=llvm-cpu --iree-input-type=stablehlo --iree-llvmcpu-target-cpu-features=+sve --iree-llvmcpu-link-embedded=false --iree-opt-data-tiling=false --iree-llvmcpu-enable-ukernels=none --iree-llvmcpu-enable-scalable-vectorization=true test.mlir -o test.vmfb

test.mlir:

hal.executable public @test {
  hal.executable.variant public @system_elf_arm_64 target(<"llvm-cpu", "system-elf-arm_64", {cpu = "", cpu_features = "+v9a,+fullfp16,+fp-armv8,+neon,+aes,+sha2,+crc,+lse,+rdm,+complxnum,+rcpc,+sha3,+sm4,+dotprod,+fp16fml,+dit,+flagm,+ssbs,+sb,+sve2-aes,+sve2-bitperm,+sve2-sha3,+sve2-sm4,+altnzcv,+fptoint,+bf16,+i8mm,+bti,+mte,+pauth,+perfmon,+predres,+spe,+ras,+sve,+sve2,+reserve-x18", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", debug_symbols = false, link_embedded = false, native_vector_size = 16 : i64, target_triple = "aarch64-none-linux-android34", ukernels = "none"}>) {
    hal.executable.export public @test ordinal(0) layout(#hal.pipeline.layout<push_constants = 5, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @test() {
        %c0_i32 = arith.constant 0 : i32
        %0 = hal.interface.constant.load[0] : i32
        %1 = hal.interface.constant.load[1] : i32
        %2 = hal.interface.constant.load[2] : i32
        %3 = hal.interface.constant.load[3] : i32
        %4 = hal.interface.constant.load[4] : i32
        %5 = arith.index_castui %0 : i32 to index
        %6 = arith.index_castui %1 : i32 to index
        %7 = arith.index_castui %2 : i32 to index
        %8 = arith.index_castui %3 : i32 to index
        %9 = arith.index_castui %4 : i32 to index
        %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x256xi8>>
        %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
        %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
        %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
        %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<128x256xf32>>
        %15 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi8>> -> tensor<128x256xi8>
        %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
        %17 = flow.dispatch.tensor.load %12, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
        %18 = flow.dispatch.tensor.load %13, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
        %19 = tensor.empty() : tensor<128x256xf32>
        %20 = tensor.empty() : tensor<128x256xi32>
        %21 = linalg.fill ins(%c0_i32 : i32) outs(%20 : tensor<128x256xi32>) -> tensor<128x256xi32>
        %22 = linalg.matmul ins(%15, %16 : tensor<128x256xi8>, tensor<256x256xi8>) outs(%21 : tensor<128x256xi32>) -> tensor<128x256xi32>
        %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%22, %17, %18 : tensor<128x256xi32>, tensor<128xf32>, tensor<256xf32>) outs(%19 : tensor<128x256xf32>) {
        ^bb0(%in: i32, %in_0: f32, %in_1: f32, %out: f32):
          %24 = arith.sitofp %in : i32 to f32
          %25 = arith.mulf %24, %in_0 : f32
          %26 = arith.mulf %25, %in_1 : f32
          linalg.yield %26 : f32
        } -> tensor<128x256xf32>
        flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : tensor<128x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x256xf32>>
        return
      }
    }
  }
}