
[mlir] affine-loop-invariant-code-motion and affine-pipeline-data-transfer Incorrect behavior #59838

Closed
lipracer opened this issue Jan 5, 2023 · 4 comments

lipracer (Member) commented Jan 5, 2023

#map = affine_map<(d0)[s0, s1] -> (d0 * 163840 + s0 * 20480 + s1 * 320)>
#set = affine_set<() : (-1 >= 0)>

func.func private @cluster_id() -> index
func.func private @core_id() -> index
func.func @main(%arg0: memref<10485760xi32>, %arg1: memref<1xi32>, %arg2: memref<10485760xi32>) {
  %c10485760 = arith.constant 10485760 : index
  %c320 = arith.constant 320 : index
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %0 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
  %1 = memref.alloc() : memref<1xi32, 2>
  %3 = call @core_id() : () -> index
  %4 = call @cluster_id() : () -> index
  affine.for %arg3 = 0 to 64 {
    %5 = affine.apply #map(%arg3)[%4, %3]
    %6 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
    %7 = memref.alloc() : memref<320xi32, 2>
    affine.dma_start %arg0[%5], %7[%c0], %6[%c0], %c320 : memref<10485760xi32>, memref<320xi32, 2>, memref<0xi32, 2>
    affine.dma_start %arg1[%c0], %1[%c0], %0[%c0], %c1 : memref<1xi32>, memref<1xi32, 2>, memref<0xi32, 2>
    affine.dma_wait %6[%c0], %c320 : memref<0xi32, 2>
    affine.dma_wait %0[%c0], %c1 : memref<0xi32, 2>
    %8 = affine.apply #map(%arg3)[%4, %3]
    %9 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
    %10 = memref.alloc() : memref<320xi32, 2>
    affine.for %arg4 = 0 to 320 {
      %11 = affine.load %7[%arg4] : memref<320xi32, 2>
      %2 = affine.load %1[0] : memref<1xi32, 2>
      %12 = arith.addi %11, %2 : i32
      %13 = arith.addi %12, %11 : i32
      affine.store %13, %10[%arg4] : memref<320xi32, 2>
    }
    affine.dma_start %10[%c0], %arg2[%8], %9[%c0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
    affine.dma_wait %9[%c0], %c320 : memref<0xi32, 2>
  }
  return
}

When I run the following command:

./bin/mlir-opt  -affine-pipeline-data-transfer -mlir-print-ir-after-all test_pipeline.mlir

I got:

#map = affine_map<(d0)[s0, s1] -> (d0 * 163840 + s0 * 20480 + s1 * 320)>
#map1 = affine_map<(d0) -> (d0 - 1)>
#map2 = affine_map<(d0) -> (d0 mod 2)>
module {
  func.func private @cluster_id() -> index
  func.func private @core_id() -> index
  func.func @main(%arg0: memref<10485760xi32>, %arg1: memref<1xi32>, %arg2: memref<10485760xi32>) {
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %c10485760 = arith.constant 10485760 : index
    %c320 = arith.constant 320 : index
    %c0_0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %0 = call @core_id() : () -> index
    %1 = call @cluster_id() : () -> index
    %alloc = memref.alloc() : memref<2x320xi32, 2>
    %alloc_1 = memref.alloc() : memref<2x1xi32, 2>
    %alloc_2 = memref.alloc() : memref<2x0xi32, 2>
    %alloc_3 = memref.alloc() : memref<2x0xi32, 2>
    %2 = affine.apply #map(%c0)[%1, %0]
    affine.dma_start %arg0[%2], %alloc[%c0 mod 2, 0], %alloc_2[%c0 mod 2, 0], %c320 : memref<10485760xi32>, memref<2x320xi32, 2>, memref<2x0xi32, 2>
    affine.dma_start %arg1[%c0_0], %alloc_1[%c0 mod 2, 0], %alloc_3[%c0 mod 2, 0], %c1 : memref<1xi32>, memref<2x1xi32, 2>, memref<2x0xi32, 2>
    affine.for %arg3 = 1 to 64 {
      %9 = affine.apply #map(%arg3)[%1, %0]
      affine.dma_start %arg0[%9], %alloc[%arg3 mod 2, 0], %alloc_2[%arg3 mod 2, 0], %c320 : memref<10485760xi32>, memref<2x320xi32, 2>, memref<2x0xi32, 2>
      affine.dma_start %arg1[%c0_0], %alloc_1[%arg3 mod 2, 0], %alloc_3[%arg3 mod 2, 0], %c1 : memref<1xi32>, memref<2x1xi32, 2>, memref<2x0xi32, 2>
      %10 = affine.apply #map1(%arg3)
      %11 = affine.apply #map2(%10)
      %12 = affine.apply #map2(%10)
      %13 = affine.apply #map2(%10)
      %14 = affine.apply #map2(%10)
      affine.dma_wait %alloc_2[%10 mod 2, 0], %c320 : memref<2x0xi32, 2>
      affine.dma_wait %alloc_3[%10 mod 2, 0], %c1 : memref<2x0xi32, 2>
      %15 = affine.apply #map(%10)[%1, %0]
      %alloc_6 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
      %alloc_7 = memref.alloc() : memref<320xi32, 2>
      affine.for %arg4 = 0 to 320 {
        %16 = affine.load %alloc[%10 mod 2, %arg4] : memref<2x320xi32, 2>
        %17 = affine.load %alloc_1[%10 mod 2, 0] : memref<2x1xi32, 2>
        %18 = arith.addi %16, %17 : i32
        %19 = arith.addi %18, %16 : i32
        affine.store %19, %alloc_7[%arg4] : memref<320xi32, 2>
      }
      affine.dma_start %alloc_7[%c0_0], %arg2[%15], %alloc_6[%c0_0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
      affine.dma_wait %alloc_6[%c0_0], %c320 : memref<0xi32, 2>
    }
    %3 = affine.apply #map1(%c64)
    %4 = affine.apply #map2(%3)
    %5 = affine.apply #map2(%3)
    %6 = affine.apply #map2(%3)
    %7 = affine.apply #map2(%3)
    affine.dma_wait %alloc_2[%3 mod 2, 0], %c320 : memref<2x0xi32, 2>
    affine.dma_wait %alloc_3[%3 mod 2, 0], %c1 : memref<2x0xi32, 2>
    %8 = affine.apply #map(%3)[%1, %0]
    %alloc_4 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
    %alloc_5 = memref.alloc() : memref<320xi32, 2>
    affine.for %arg3 = 0 to 320 {
      %9 = affine.load %alloc[%3 mod 2, %arg3] : memref<2x320xi32, 2>
      %10 = affine.load %alloc_1[%3 mod 2, 0] : memref<2x1xi32, 2>
      %11 = arith.addi %9, %10 : i32
      %12 = arith.addi %11, %9 : i32
      affine.store %12, %alloc_5[%arg3] : memref<320xi32, 2>
    }
    affine.dma_start %alloc_5[%c0_0], %arg2[%8], %alloc_4[%c0_0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
    affine.dma_wait %alloc_4[%c0_0], %c320 : memref<0xi32, 2>
    memref.dealloc %alloc_3 : memref<2x0xi32, 2>
    memref.dealloc %alloc_2 : memref<2x0xi32, 2>
    memref.dealloc %alloc_1 : memref<2x1xi32, 2>
    memref.dealloc %alloc : memref<2x320xi32, 2>
    return
  }
}

Obviously, a scalar (loop-invariant) transfer like the one below should not be double buffered. I think this happens because the scalar DMA is never promoted out of the loop as a loop invariant (see the sketch after the snippet):

    affine.dma_start %arg1[%c0_0], %alloc_1[%c0 mod 2, 0], %alloc_3[%c0 mod 2, 0], %c1 : memref<1xi32>, memref<2x1xi32, 2>, memref<2x0xi32, 2>
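
For illustration only, here is a minimal C++ sketch of one possible guard inside the pass (the helper name needsDoubleBuffering is hypothetical, not existing pass code): a DMA whose operands are all defined outside the loop moves the same data on every iteration, so it could be left single-buffered (or hoisted) instead of being double-buffered.

// Hypothetical guard, sketch only: skip double buffering for a DMA whose
// operands are all loop-invariant. This is purely an SSA dominance check and
// ignores memory dependences on the transferred buffers.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"

using namespace mlir;

static bool needsDoubleBuffering(Operation *dmaStart, LoopLikeOpInterface loop) {
  for (Value operand : dmaStart->getOperands())
    if (!loop.isDefinedOutsideOfLoop(operand))
      return true; // the transfer depends on a value computed in the loop
  // e.g. the scalar DMA on %arg1 above: every operand is defined outside the
  // affine.for, so double buffering gains nothing.
  return false;
}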

But when I run:

./bin/mlir-opt -affine-loop-invariant-code-motion -affine-pipeline-data-transfer -mlir-print-ir-after-all test_pipeline.mlir

I got:

#map = affine_map<(d0)[s0, s1] -> (d0 * 163840 + s0 * 20480 + s1 * 320)>
#map1 = affine_map<(d0) -> (d0 - 1)>
#map2 = affine_map<(d0) -> (d0 mod 2)>
module {
  func.func private @cluster_id() -> index
  func.func private @core_id() -> index
  func.func @main(%arg0: memref<10485760xi32>, %arg1: memref<1xi32>, %arg2: memref<10485760xi32>) {
    %c64 = arith.constant 64 : index
    %c0 = arith.constant 0 : index
    %c10485760 = arith.constant 10485760 : index
    %c320 = arith.constant 320 : index
    %c0_0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
    %alloc_1 = memref.alloc() : memref<1xi32, 2>
    %0 = call @core_id() : () -> index
    %1 = call @cluster_id() : () -> index
    %2 = affine.load %alloc_1[0] : memref<1xi32, 2>
    %alloc_2 = memref.alloc() : memref<2x320xi32, 2>
    %alloc_3 = memref.alloc() : memref<2x0xi32, 2>
    %3 = affine.apply #map(%c0)[%1, %0]
    affine.dma_start %arg0[%3], %alloc_2[%c0 mod 2, 0], %alloc_3[%c0 mod 2, 0], %c320 : memref<10485760xi32>, memref<2x320xi32, 2>, memref<2x0xi32, 2>
    affine.for %arg3 = 1 to 64 {
      %8 = affine.apply #map(%arg3)[%1, %0]
      affine.dma_start %arg0[%8], %alloc_2[%arg3 mod 2, 0], %alloc_3[%arg3 mod 2, 0], %c320 : memref<10485760xi32>, memref<2x320xi32, 2>, memref<2x0xi32, 2>
      %9 = affine.apply #map1(%arg3)
      %10 = affine.apply #map2(%9)
      %11 = affine.apply #map2(%9)
      affine.dma_start %arg1[%c0_0], %alloc_1[%c0_0], %alloc[%c0_0], %c1 : memref<1xi32>, memref<1xi32, 2>, memref<0xi32, 2>
      affine.dma_wait %alloc_3[%9 mod 2, 0], %c320 : memref<2x0xi32, 2>
      affine.dma_wait %alloc[%c0_0], %c1 : memref<0xi32, 2>
      %12 = affine.apply #map(%9)[%1, %0]
      %alloc_6 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
      %alloc_7 = memref.alloc() : memref<320xi32, 2>
      affine.for %arg4 = 0 to 320 {
        %13 = affine.load %alloc_2[%9 mod 2, %arg4] : memref<2x320xi32, 2>
        %14 = arith.addi %13, %2 : i32
        %15 = arith.addi %14, %13 : i32
        affine.store %15, %alloc_7[%arg4] : memref<320xi32, 2>
      }
      affine.dma_start %alloc_7[%c0_0], %arg2[%12], %alloc_6[%c0_0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
      affine.dma_wait %alloc_6[%c0_0], %c320 : memref<0xi32, 2>
    }
    %4 = affine.apply #map1(%c64)
    %5 = affine.apply #map2(%4)
    %6 = affine.apply #map2(%4)
    affine.dma_start %arg1[%c0_0], %alloc_1[%c0_0], %alloc[%c0_0], %c1 : memref<1xi32>, memref<1xi32, 2>, memref<0xi32, 2>
    affine.dma_wait %alloc_3[%4 mod 2, 0], %c320 : memref<2x0xi32, 2>
    affine.dma_wait %alloc[%c0_0], %c1 : memref<0xi32, 2>
    %7 = affine.apply #map(%4)[%1, %0]
    %alloc_4 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
    %alloc_5 = memref.alloc() : memref<320xi32, 2>
    affine.for %arg3 = 0 to 320 {
      %8 = affine.load %alloc_2[%4 mod 2, %arg3] : memref<2x320xi32, 2>
      %9 = arith.addi %8, %2 : i32
      %10 = arith.addi %9, %8 : i32
      affine.store %10, %alloc_5[%arg3] : memref<320xi32, 2>
    }
    affine.dma_start %alloc_5[%c0_0], %arg2[%7], %alloc_4[%c0_0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
    affine.dma_wait %alloc_4[%c0_0], %c320 : memref<0xi32, 2>
    memref.dealloc %alloc_3 : memref<2x0xi32, 2>
    memref.dealloc %alloc_2 : memref<2x320xi32, 2>
    return
  }
}

The affine.load now appears before the dma_start that fills its buffer, which is obviously incorrect. When I looked at the source code, I found that the handling here is marked with a FIXME. Could we do a simple memory-effects analysis to handle this problem? If so, I will submit a patch.
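
To make the suggestion concrete, here is a rough sketch of the kind of check I mean (the helper name mayBeWrittenInLoop is hypothetical, and this is not the current pass code): before hoisting an affine.load as loop-invariant, walk the loop body and give up if any operation may write to the loaded memref according to MemoryEffectOpInterface.

// Sketch only: conservative memory-effects check for hoisting a load.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

static bool mayBeWrittenInLoop(Value memref, Region &loopBody) {
  bool mayWrite = false;
  loopBody.walk([&](Operation *op) {
    auto iface = dyn_cast<MemoryEffectOpInterface>(op);
    if (!iface) {
      // No effect information (e.g. nested control flow without the
      // interface): conservatively assume it may write.
      mayWrite = true;
      return WalkResult::interrupt();
    }
    SmallVector<MemoryEffects::EffectInstance> effects;
    iface.getEffects(effects);
    for (MemoryEffects::EffectInstance &effect : effects) {
      if (isa<MemoryEffects::Write>(effect.getEffect()) &&
          effect.getValue() == memref) {
        mayWrite = true; // a writer of the loaded memref was found
        return WalkResult::interrupt();
      }
    }
    return WalkResult::advance();
  });
  return mayWrite;
}

With such a check, the affine.load on %alloc_1 would not be hoisted: either the affine.dma_start inside the loop reports a write on that memref, or, if it does not implement the interface, the conservative fallback refuses the hoist anyway.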

llvmbot (Collaborator) commented Jan 5, 2023

@llvm/issue-subscribers-mlir-affine

lipracer (Member, Author) commented Jan 6, 2023

I have submitted a patch.
I also found the generic LoopInvariantCodeMotion pass. Could AffineLoopInvariantCodeMotion then be removed safely?
However, neither pass can handle operations with side effects, and neither performs alias analysis.
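
For context, my understanding of the generic pass (sketch only, simplified, assuming the isMemoryEffectFree / isSpeculatable helpers from mlir/Interfaces/SideEffectInterfaces.h) is that it hoists an op only when it touches no memory at all and is safe to speculate, so it sidesteps alias analysis entirely rather than solving it:

// Simplified sketch of a conservative hoisting condition; not the verbatim
// upstream implementation.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

static bool canHoistConservatively(Operation *op, LoopLikeOpInterface loop) {
  // Every operand must already be available outside the loop.
  for (Value operand : op->getOperands())
    if (!loop.isDefinedOutsideOfLoop(operand))
      return false;
  // Reject anything that reads/writes memory or may trap; affine.load and
  // affine.dma_start therefore never move under this rule.
  return isMemoryEffectFree(op) && isSpeculatable(op);
}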

bondhugula (Contributor) commented:

If -licm subsumes -affine-licm, the latter can be removed.

lipracer (Member, Author) commented:

> If -licm subsumes -affine-licm, the latter can be removed.

I will test the -licm pass, then remove -affine-licm later.

lipracer closed this as completed Feb 4, 2023