mlir/Affine: sibling loop fusion pass missing opportunity when memrefs are allocated in function scope

The following MLIR file has two functions with identical loop nests, but only the first one (`@f1`) gets fused:

```mlir
func.func @f1(%input : memref<10xf32>, %output : memref<10xf32>, %reduc : memref<10xf32>) {
  %zero = arith.constant 0. : f32
  %one = arith.constant 1. : f32
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %2 = arith.addf %0, %one : f32
    affine.store %2, %output[%i] : memref<10xf32>
  }
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %1 = affine.load %reduc[0] : memref<10xf32>
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %reduc[0] : memref<10xf32>
  }
  return
}

func.func @f2() {
  %input = memref.alloc() : memref<10xf32>
  %output = memref.alloc() : memref<10xf32>
  %reduc = memref.alloc() : memref<10xf32>
  %zero = arith.constant 0. : f32
  %one = arith.constant 1. : f32
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %2 = arith.addf %0, %one : f32
    affine.store %2, %output[%i] : memref<10xf32>
  }
  affine.for %i = 0 to 10 {
    %0 = affine.load %input[%i] : memref<10xf32>
    %1 = affine.load %reduc[0] : memref<10xf32>
    %2 = arith.addf %0, %1 : f32
    affine.store %2, %reduc[0] : memref<10xf32>
  }
  return
}
```

Running with
```shell
./bin/mlir-opt ../testing-2.mlir -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=sibling}))'
```
yields:

```mlir
module {
  func.func @f1(%arg0: memref<10xf32>, %arg1: memref<10xf32>, %arg2: memref<10xf32>) {
    %cst = arith.constant 0.000000e+00 : f32
    %cst_0 = arith.constant 1.000000e+00 : f32
    affine.for %arg3 = 0 to 10 {
      %0 = affine.load %arg0[%arg3] : memref<10xf32>
      %1 = arith.addf %0, %cst_0 : f32
      affine.store %1, %arg1[%arg3] : memref<10xf32>
      %2 = affine.load %arg0[%arg3] : memref<10xf32>
      %3 = affine.load %arg2[0] : memref<10xf32>
      %4 = arith.addf %2, %3 : f32
      affine.store %4, %arg2[0] : memref<10xf32>
    }
    return
  }
  func.func @f2() {
    %alloc = memref.alloc() : memref<10xf32>
    %alloc_0 = memref.alloc() : memref<10xf32>
    %alloc_1 = memref.alloc() : memref<10xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %cst_2 = arith.constant 1.000000e+00 : f32
    affine.for %arg0 = 0 to 10 {
      %0 = affine.load %alloc[%arg0] : memref<10xf32>
      %1 = arith.addf %0, %cst_2 : f32
      affine.store %1, %alloc_0[%arg0] : memref<10xf32>
    }
    affine.for %arg0 = 0 to 10 {
      %0 = affine.load %alloc[%arg0] : memref<10xf32>
      %1 = affine.load %alloc_1[0] : memref<10xf32>
      %2 = arith.addf %0, %1 : f32
      affine.store %2, %alloc_1[0] : memref<10xf32>
    }
    return
  }
}
```

When the `memref`s involved are allocated inside the function body (with `memref.alloc`) instead of being passed as function arguments, the pass does not fuse the two loops. Adding `-debug-only=affine-loop-fusion,loop-fusion-utils` produces no output that I can see, so I have nothing further to go on to understand why sibling fusion does not happen in this case.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!