
[CPU] Extra buffer created for tensor.unpack when second dim is not distributed #16868

Open
dcaballe opened this issue Mar 21, 2024 · 0 comments


I'm hitting a compilation error when changing the way we distribute tensor.unpack. It looks like if we only distribute one of the dims (e.g., [32, 0] distribution tile sizes), we end up generating an extra buffer, and compilation then fails because of the resulting stack allocation.
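For illustration only (the fully distributed variant below is a hypothetical example, not taken from the original model), the difference is confined to the first, distribution-level entry of the lowering config; the problem shows up when the second dim's distribution tile size is 0:

// Hypothetical fully distributed config: both unpacked dims are tiled across workgroups.
#config_distributed = #iree_codegen.lowering_config<tile_sizes = [[32, 32], [8, 4]]>
// Config used in this reproducer: the second dim is not distributed (tile size 0).
#config_failing = #iree_codegen.lowering_config<tile_sizes = [[32, 0], [8, 4]]>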

I managed to reduce the issue to the following IR before ConvertToDestinationPassingStyle:

#config = #iree_codegen.lowering_config<tile_sizes = [[32, 0], [8, 4]]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<()[s0] -> (s0 ceildiv 8)>
#map1 = affine_map<()[s0] -> (s0 ceildiv 4)>
#map2 = affine_map<()[s0] -> (s0 ceildiv 32)>
#map3 = affine_map<()[s0] -> (s0 * 32)>
#map4 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
#map5 = affine_map<(d0) -> (d0 ceildiv 8)>
#map6 = affine_map<()[s0] -> ((s0 - 1) floordiv 4 + 1)>
#map7 = affine_map<(d0) -> (d0 floordiv 8)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout1 = #hal.pipeline.layout<push_constants = 8, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout2 = #hal.pipeline.layout<push_constants = 10, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout3 = #hal.pipeline.layout<push_constants = 16, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout4 = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#pipeline_layout5 = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDataTiling>
#translation1 = #iree_codegen.translation_info<Mmt4dTilingExpert>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4_unpack_f32 ordinal(0) layout(#pipeline_layout2) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map2()[%arg3]
        hal.return %0, %c1, %c1 : index, index, index
      }
      builtin.module {
        func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4_unpack_f32() {
          %c4 = arith.constant 4 : index
          %c32_i64 = arith.constant 32 : i64
          %c0 = arith.constant 0 : index
          %0 = hal.interface.constant.load[0] : i32
          %1 = hal.interface.constant.load[1] : i32
          %2 = hal.interface.constant.load[2] : i32
          %3 = hal.interface.constant.load[3] : i32
          %4 = hal.interface.constant.load[4] : i32
          %5 = hal.interface.constant.load[5] : i32
          %6 = hal.interface.constant.load[6] : i32
          %7 = hal.interface.constant.load[7] : i32
          %8 = hal.interface.constant.load[8] : i32
          %9 = hal.interface.constant.load[9] : i32
          %10 = arith.extui %0 : i32 to i64
          %11 = arith.extui %1 : i32 to i64
          %12 = arith.shli %11, %c32_i64 : i64
          %13 = arith.ori %10, %12 : i64
          %14 = arith.index_castui %13 : i64 to index
          %15 = arith.extui %2 : i32 to i64
          %16 = arith.extui %3 : i32 to i64
          %17 = arith.shli %16, %c32_i64 : i64
          %18 = arith.ori %15, %17 : i64
          %19 = arith.index_castui %18 : i64 to index
          %20 = arith.extui %4 : i32 to i64
          %21 = arith.extui %5 : i32 to i64
          %22 = arith.shli %21, %c32_i64 : i64
          %23 = arith.ori %20, %22 : i64
          %24 = arith.index_castui %23 : i64 to index
          %25 = arith.extui %6 : i32 to i64
          %26 = arith.extui %7 : i32 to i64
          %27 = arith.shli %26, %c32_i64 : i64
          %28 = arith.ori %25, %27 : i64
          %29 = arith.index_castui %28 : i64 to index
          %30 = arith.extui %8 : i32 to i64
          %31 = arith.extui %9 : i32 to i64
          %32 = arith.shli %31, %c32_i64 : i64
          %33 = arith.ori %30, %32 : i64
          %34 = arith.index_castui %33 : i64 to index
          %35 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%14) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x8x4xf32>>{%19, %24}
          %36 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%29, %34}
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %37 = affine.apply #map3()[%workgroup_id_x]
          %38 = affine.apply #map3()[%workgroup_count_x]
          scf.for %arg0 = %37 to %29 step %38 {
            %39 = affine.min #map4(%arg0)[%29]
            %40 = affine.apply #map5(%39)
            %41 = affine.apply #map6()[%34]
            %42 = arith.muli %41, %c4 : index
            %43 = affine.apply #map7(%arg0)
            %44 = flow.dispatch.tensor.load %35, offsets = [%43, 0, 0, 0], sizes = [%40, %41, 8, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4xf32>>{%19, %24} -> tensor<?x?x8x4xf32>
            %45 = bufferization.alloc_tensor(%39, %42) : tensor<?x?xf32>
            %unpack = tensor.unpack %44 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %45 {lowering_config = #config} : tensor<?x?x8x4xf32> -> tensor<?x?xf32>
            %extracted_slice = tensor.extract_slice %unpack[0, 0] [%39, %34] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
            %46 = arith.extui %6 : i32 to i64
            %47 = arith.extui %7 : i32 to i64
            %48 = arith.shli %47, %c32_i64 : i64
            %49 = arith.ori %46, %48 : i64
            %50 = arith.index_castui %49 : i64 to index
            %51 = arith.extui %8 : i32 to i64
            %52 = arith.extui %9 : i32 to i64
            %53 = arith.shli %52, %c32_i64 : i64
            %54 = arith.ori %51, %53 : i64
            %55 = arith.index_castui %54 : i64 to index
            flow.dispatch.tensor.store %extracted_slice, %36, offsets = [%arg0, 0], sizes = [%39, %34], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%50, %55}
          }
          return
        }
      }
    }
  }
}

and this is the IR after ConvertToDestinationPassingStyle (command to reproduce included as a comment):

// iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-convert-to-destination-passing-style)))))" test.mlir

#config = #iree_codegen.lowering_config<tile_sizes = [[32, 0], [8, 4]]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<()[s0] -> (s0 ceildiv 32)>
#map1 = affine_map<()[s0] -> (s0 * 32)>
#map2 = affine_map<(d0)[s0] -> (-d0 + s0, 32)>
#map3 = affine_map<(d0) -> (d0 ceildiv 8)>
#map4 = affine_map<()[s0] -> ((s0 - 1) floordiv 4 + 1)>
#map5 = affine_map<(d0) -> (d0 floordiv 8)>
#pipeline_layout = #hal.pipeline.layout<push_constants = 10, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
#translation = #iree_codegen.translation_info<CPUDataTiling>
#device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]>
module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
  hal.executable private @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4 {
    hal.executable.variant public @embedded_elf_x86_64 target(#executable_target_embedded_elf_x86_64_) {
      hal.executable.export public @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4_unpack_f32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], translation_info = #translation} {
      ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
        %c1 = arith.constant 1 : index
        %0 = affine.apply #map()[%arg3]
        hal.return %0, %c1, %c1 : index, index, index
      }
      builtin.module {
        func.func @matmul_accumulate_DYNxDYNxbf16_times_DYNxDYNxbf16_into_DYNxDYNxf32_dispatch_4_unpack_f32() {
          %c4 = arith.constant 4 : index
          %c32_i64 = arith.constant 32 : i64
          %c0 = arith.constant 0 : index
          %0 = hal.interface.constant.load[0] : i32
          %1 = hal.interface.constant.load[1] : i32
          %2 = hal.interface.constant.load[2] : i32
          %3 = hal.interface.constant.load[3] : i32
          %4 = hal.interface.constant.load[4] : i32
          %5 = hal.interface.constant.load[5] : i32
          %6 = hal.interface.constant.load[6] : i32
          %7 = hal.interface.constant.load[7] : i32
          %8 = hal.interface.constant.load[8] : i32
          %9 = hal.interface.constant.load[9] : i32
          %10 = arith.extui %0 : i32 to i64
          %11 = arith.extui %1 : i32 to i64
          %12 = arith.shli %11, %c32_i64 : i64
          %13 = arith.ori %10, %12 : i64
          %14 = arith.index_castui %13 : i64 to index
          %15 = arith.extui %2 : i32 to i64
          %16 = arith.extui %3 : i32 to i64
          %17 = arith.shli %16, %c32_i64 : i64
          %18 = arith.ori %15, %17 : i64
          %19 = arith.index_castui %18 : i64 to index
          %20 = arith.extui %4 : i32 to i64
          %21 = arith.extui %5 : i32 to i64
          %22 = arith.shli %21, %c32_i64 : i64
          %23 = arith.ori %20, %22 : i64
          %24 = arith.index_castui %23 : i64 to index
          %25 = arith.extui %6 : i32 to i64
          %26 = arith.extui %7 : i32 to i64
          %27 = arith.shli %26, %c32_i64 : i64
          %28 = arith.ori %25, %27 : i64
          %29 = arith.index_castui %28 : i64 to index
          %30 = arith.extui %8 : i32 to i64
          %31 = arith.extui %9 : i32 to i64
          %32 = arith.shli %31, %c32_i64 : i64
          %33 = arith.ori %30, %32 : i64
          %34 = arith.index_castui %33 : i64 to index
          %35 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%14) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x8x4xf32>>{%19, %24}
          %36 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%29, %34}
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %37 = affine.apply #map1()[%workgroup_id_x]
          %38 = affine.apply #map1()[%workgroup_count_x]
          scf.for %arg0 = %37 to %29 step %38 {
            %39 = affine.min #map2(%arg0)[%29]
            %40 = affine.apply #map3(%39)
            %41 = affine.apply #map4()[%34]
            %42 = arith.muli %41, %c4 : index
            %43 = affine.apply #map5(%arg0)
            %44 = flow.dispatch.tensor.load %35, offsets = [%43, 0, 0, 0], sizes = [%40, %41, 8, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x8x4xf32>>{%19, %24} -> tensor<?x?x8x4xf32>
            %45 = bufferization.alloc_tensor(%39, %42) : tensor<?x?xf32>
            %unpack = tensor.unpack %44 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %45 {lowering_config = #config} : tensor<?x?x8x4xf32> -> tensor<?x?xf32>
            %extracted_slice = tensor.extract_slice %unpack[0, 0] [%39, %34] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
            %46 = arith.extui %6 : i32 to i64
            %47 = arith.extui %7 : i32 to i64
            %48 = arith.shli %47, %c32_i64 : i64
            %49 = arith.ori %46, %48 : i64
            %50 = arith.index_castui %49 : i64 to index
            %51 = arith.extui %8 : i32 to i64
            %52 = arith.extui %9 : i32 to i64
            %53 = arith.shli %52, %c32_i64 : i64
            %54 = arith.ori %51, %53 : i64
            %55 = arith.index_castui %54 : i64 to index
            flow.dispatch.tensor.store %extracted_slice, %36, offsets = [%arg0, 0], sizes = [%39, %34], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%50, %55}
          }
          return
        }
      }
    }
  }
}

Note that the loop body is unchanged: ConvertToDestinationPassingStyle leaves the temporary buffer allocation (%45 = bufferization.alloc_tensor(%39, %42) : tensor<?x?xf32>) in place instead of making the output the destination of the unpack, and that allocation becomes the extra stack allocation at the end of the pipeline.
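For reference, here is a hand-written sketch (reusing the SSA names from the dump above; not actual compiler output, and the exact dynamic sizes are simplified) of the destination-passing form one would hope for, where the unpack writes directly into the slice loaded from the output binding and no temporary tensor is allocated:

// Hypothetical DPS form: use the output binding slice as the unpack destination.
%dest = flow.dispatch.tensor.load %36, offsets = [%arg0, 0], sizes = [%39, %34], strides = [1, 1]
    : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%29, %34} -> tensor<?x?xf32>
%unpack = tensor.unpack %44 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 4]
    into %dest {lowering_config = #config} : tensor<?x?x8x4xf32> -> tensor<?x?xf32>
flow.dispatch.tensor.store %unpack, %36, offsets = [%arg0, 0], sizes = [%39, %34], strides = [1, 1]
    : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%29, %34}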

CC: @hanhanW, @MaheshRavishankar

@dcaballe dcaballe added codegen Shared code generation infrastructure and dialects codegen/llvm LLVM code generation compiler backend labels Mar 21, 2024