24 changes: 12 additions & 12 deletions mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -109,7 +109,7 @@ func.func @reduction_tile_parallel(
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-%loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+%loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
by num_threads = [0, 5], tile_sizes = []
}

@@ -127,7 +127,7 @@ transform.sequence failures(propagate) {
// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
-// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
+// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
// CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
// CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
// CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
@@ -139,7 +139,7 @@ transform.sequence failures(propagate) {
// CHECK: arith.addf
// CHECK: linalg.yield
// CHECK: } -> tensor<?xf32>
-// CHECK: scf.foreach_thread.perform_concurrently {
+// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?xf32> into tensor<?x5xf32>
// CHECK: }
// CHECK: }
@@ -161,7 +161,7 @@ func.func @matmul_tile_parallel(
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-%loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+%loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
by num_threads = [0, 0, 5], tile_sizes = []
}

@@ -181,7 +181,7 @@ transform.sequence failures(propagate) {
// CHECK-DAG: %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
// CHECK: %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor<?x?x5xf32>
// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x?x5xf32>) -> tensor<?x?x5xf32>
-// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x?x5xf32>) {
+// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x?x5xf32>) {
// CHECK-DAG: %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
// CHECK-DAG: %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
// CHECK-DAG: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor<?x?x5xf32> to tensor<?x?xf32>
@@ -190,7 +190,7 @@ transform.sequence failures(propagate) {
// CHECK: %[[INCHUNKB:.+]] = tensor.extract_slice %[[ARG1]][%[[TINDEX]], 0] [%[[TS1]], %[[D2]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: %[[TEMPEXT:.+]] = tensor.extract_slice %[[ET]][0, 0] [%[[D0]], %[[D2]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// CHECK: %[[PARTIAL:.+]] = linalg.matmul ins(%[[INCHUNKA]], %[[INCHUNKB]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[TEMPEXT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-// CHECK: scf.foreach_thread.perform_concurrently {
+// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor<?x?xf32> into tensor<?x?x5xf32>
// CHECK: }
// CHECK: }
@@ -220,7 +220,7 @@ func.func @reduction_tile_parallel_cyclic_dist(
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-%loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+%loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>]
}

@@ -238,7 +238,7 @@ transform.sequence failures(propagate) {
// CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
// CHECK: %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
// CHECK: %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
-// CHECK: %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
+// CHECK: %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
// CHECK: %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
// CHECK: %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
// CHECK: %[[LB:.+]] = affine.apply #[[MAP0]]()[%[[IV]]]
@@ -255,7 +255,7 @@ transform.sequence failures(propagate) {
// CHECK: %[[INS:.+]] = tensor.insert_slice %[[PARTIAL]] into %[[ACC]][0] [%[[D3]]] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: scf.yield %[[INS]] : tensor<?xf32>
// CHECK: }
-// CHECK: scf.foreach_thread.perform_concurrently {
+// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[CARRY]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?xf32> into tensor<?x5xf32>
// CHECK: }
// CHECK: }
@@ -285,7 +285,7 @@ func.func @reduction_tile_parallel_cyclic_dist(
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-%loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+%loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>]

// CHECK: expecting fill
@@ -303,7 +303,7 @@ transform.sequence failures(propagate) {

// -----

-func.func @reduction_untiled_foreach_thread(
+func.func @reduction_untiled_forall(
%arg0: tensor<?x?xf32>, %out: tensor<?xf32>) -> tensor<?xf32> {
// expected-note @below {{target operation}}
%red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
@@ -323,7 +323,7 @@ transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
// expected-error @below {{could not tile reduction}}
-%loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+%loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
by num_threads = [5], tile_sizes = [3], mapping = [#gpu.thread<x>]

}
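This file's hunks are a pure respelling: transform.structured.tile_reduction_using_foreach_thread becomes transform.structured.tile_reduction_using_forall, and the checked loop becomes scf.forall with an scf.forall.in_parallel terminator; the tiling behavior is unchanged. A minimal sketch of the new spelling, assembled from the hunks above rather than copied from any single test:

```mlir
transform.sequence failures(propagate) {
^bb0(%arg1: !pdl.operation):
  %0 = transform.structured.match ops{["linalg.generic"]} in %arg1
    : (!pdl.operation) -> !pdl.operation
  // Formerly: transform.structured.tile_reduction_using_foreach_thread.
  // The result handles mean the same things after the rename.
  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
    by num_threads = [0, 5], tile_sizes = []
}
```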
4 changes: 2 additions & 2 deletions mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1486,8 +1486,8 @@ func.func @canonicalize_parallel_insert_slice_indices(
// CHECK: %[[c1:.*]] = arith.constant 1 : index
%c1 = arith.constant 1 : index

-%2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
-scf.foreach_thread.perform_concurrently {
+%2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %arg0 into %o[%tidx, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<?x?xf32>
}
}
2 changes: 1 addition & 1 deletion mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir
@@ -8,7 +8,7 @@ func.func @reduce() {
linalg.fill ins(%cst_0 : f32) outs(%0 : memref<128x384xf32>)
%2 = memref.alloc() : memref<128xf32>
linalg.fill ins(%cst_0 : f32) outs(%2 : memref<128xf32>)
-scf.foreach_thread (%arg0) in (%c2) {
+scf.forall (%arg0) in (%c2) {
%7 = affine.min affine_map<(d0) -> (d0 * -64 + 128, 64)>(%arg0)
%8 = affine.max affine_map<(d0) -> (0, d0)>(%7)
%9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)
16 changes: 8 additions & 8 deletions mlir/test/Dialect/SCF/invalid.mlir
@@ -548,9 +548,9 @@ func.func @wrong_num_results(%in: tensor<100xf32>, %out: tensor<100xf32>) {
%num_threads = arith.constant 100 : index

// expected-error @+1 {{1 operands present, but expected 2}}
-%result:2 = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) {
+%result:2 = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
}
@@ -564,9 +564,9 @@ func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) {
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-%result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+%result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
// expected-error @+1 {{may only insert into an output block argument}}
tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
@@ -581,10 +581,10 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-%result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+%result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
// expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
%0 = arith.constant 1: index
@@ -598,8 +598,8 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
%one = arith.constant 1 : index
%c65535 = arith.constant 65535 : index
-// expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}}
-scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+// expected-error @below {{'scf.forall' op mapping attribute size must match op rank}}
+scf.forall (%i, %j) in (%c65535, %c65535) {
%4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
%5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
%6 = math.fma %alpha, %4, %5 : f32
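All of the diagnostics above fire unchanged under the new spelling; only the op names in the messages differ. For contrast, a sketch of a well-formed scf.forall, adapted from the valid tests elsewhere in this patch: each shared_outs operand yields one loop result, and the scf.forall.in_parallel terminator may contain only tensor.parallel_insert_slice ops that target the output block argument.

```mlir
func.func @valid_forall(%in: tensor<100xf32>, %out: tensor<100xf32>) -> tensor<100xf32> {
  %num_threads = arith.constant 100 : index
  %result = scf.forall (%thread_idx) in (%num_threads)
      shared_outs(%o = %out) -> (tensor<100xf32>) {
    %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
    // Inserts go through %o, the output block argument, not through %out.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
        tensor<1xf32> into tensor<100xf32>
    }
  }
  return %result : tensor<100xf32>
}
```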
4 changes: 2 additions & 2 deletions mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -614,7 +614,7 @@ func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
%c0 = arith.constant 0 : index
%cst = arith.constant -0.000000e+00 : f32
%c320 = arith.constant 320 : index
-%4 = scf.foreach_thread (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
+%4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
// CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
%5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1] : tensor<320x10240xf32> to tensor<1x10240xf32>
// CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
@@ -624,7 +624,7 @@ func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
// CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
%8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>

-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
}
@@ -108,23 +108,23 @@ func.func @scf_while_non_equiv_condition_and_body(%A: tensor<5xi1>,

// -----

-// CHECK-LABEL: func @scf_foreach_thread_out_of_place(
+// CHECK-LABEL: func @scf_forall_out_of_place(
// CHECK-SAME: %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32>
-// CHECK-FUNC-LABEL: func @scf_foreach_thread_out_of_place(
-func.func @scf_foreach_thread_out_of_place(%in: tensor<100xf32>,
+// CHECK-FUNC-LABEL: func @scf_forall_out_of_place(
+func.func @scf_forall_out_of_place(%in: tensor<100xf32>,
%out: tensor<100xf32>) {
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

// CHECK-FUNC-NOT: alloc_tensor
// CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() copy(%[[arg1]]) {bufferization.escape = [false]} : tensor<100xf32>
-// CHECK: scf.foreach_thread {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
-%result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+// CHECK: scf.forall {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
+%result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
// CHECK: tensor.extract_slice
-// CHECK: scf.foreach_thread.perform_concurrently
+// CHECK: scf.forall.in_parallel
// CHECK: tensor.parallel_insert_slice %{{.*}} into %[[o]]
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
}
34 changes: 17 additions & 17 deletions mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -543,8 +543,8 @@ func.func @parallel_insert_slice_no_conflict(
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

-// CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-%2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
+// CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]])
+%2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
// CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1]
%6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor<?xf32> to tensor<?xf32>
// CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref<?xf32
Expand All @@ -553,9 +553,9 @@ func.func @parallel_insert_slice_no_conflict(
// CHECK: memref.copy %[[subview]], %[[subview]]

// Empty terminator is elided from pretty-printing.
-// CHECK-NOT: scf.foreach_thread.perform_concurrently
+// CHECK-NOT: scf.forall.in_parallel
// CHECK-NOT: parallel_insert_slice
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
tensor<?xf32> into tensor<?xf32>
}
@@ -589,8 +589,8 @@ func.func @parallel_insert_slice_with_conflict(
// CHECK: %[[alloc1:.*]] = memref.alloc
// CHECK: memref.copy %[[arg2]], %[[alloc1]]

-// CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-%2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
+// CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]])
+%2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
// CHECK: %[[subview1:.*]] = memref.subview %[[alloc1]][5] [%[[idx]]] [1]
%6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor<?xf32> to tensor<?xf32>

@@ -601,9 +601,9 @@ func.func @parallel_insert_slice_with_conflict(
// CHECK: memref.copy %[[subview1]], %[[subview1]]

// Empty terminator is elided from pretty-printing.
-// CHECK-NOT: scf.foreach_thread.perform_concurrently
+// CHECK-NOT: scf.forall.in_parallel
// CHECK-NOT: parallel_insert_slice
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
tensor<?xf32> into tensor<?xf32>
}
@@ -629,8 +629,8 @@ func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index

-// CHECK: scf.foreach_thread {{.*}}
-%0 = scf.foreach_thread (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
+// CHECK: scf.forall {{.*}}
+%0 = scf.forall (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
%1 = affine.apply #map0(%arg3)
%3 = tensor.extract_slice %arg0[%1, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
%4 = affine.apply #map1(%arg4)
@@ -639,7 +639,7 @@ func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<

// CHECK: linalg.matmul ins({{.*}}memref<4x8xf32, strided<[?, ?], offset: ?>>, memref<8x4xf32, strided<[?, ?], offset: ?>>) outs({{.*}} : memref<4x4xf32, strided<[?, ?], offset: ?>>)
%8 = linalg.matmul ins(%3, %6 : tensor<4x8xf32>, tensor<8x4xf32>) outs(%7 : tensor<4x4xf32>) -> tensor<4x4xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %o[%1, %4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
}
}
@@ -658,19 +658,19 @@ func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
// CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
// CHECK: memref.copy %[[t]], %[[t_copy]]

-// CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
+// CHECK: scf.forall (%{{.*}}) in (%{{.*}}) {

// Load from the copy and store into the shared output.
// CHECK: %[[subview:.*]] = memref.subview %[[t]]
// CHECK: memref.load %[[t_copy]]
// CHECK: memref.store %{{.*}}, %[[subview]]
-%0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
+%0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
%offset = arith.muli %c5, %tid : index
%slice = tensor.extract_slice %o[%offset] [5] [1]
: tensor<10xf32> to tensor<5xf32>
%r2 = tensor.extract %t[%tid] : tensor<10xf32>
%i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %i into %o[%offset] [5] [1]
: tensor<5xf32> into tensor<10xf32>
}
@@ -691,8 +691,8 @@ func.func @scf_foreach_privatized_but_not_copied(

// CHECK-NOT: memref.alloc
// CHECK-NOT: memref.copy
-// CHECK: scf.foreach_thread {{.*}} {
-%0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
+// CHECK: scf.forall {{.*}} {
+%0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
%offset = arith.muli %c5, %tid : index
%slice = tensor.extract_slice %o[%offset] [5] [1]
: tensor<10xf32> to tensor<5xf32>
@@ -701,7 +701,7 @@ func.func @scf_foreach_privatized_but_not_copied(
// CHECK: memref.load %[[t1]]
%r2 = tensor.extract %t1[%tid] : tensor<10xf32>
%i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %i into %o[%offset] [5] [1]
: tensor<5xf32> into tensor<10xf32>
}
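The pattern running through this file: when every thread writes a disjoint slice of the shared output, one-shot bufferization lowers scf.forall in place (memref.subview plus in-place ops, no allocation); when the analysis finds a conflict, it privatizes first with memref.alloc and memref.copy, exactly as the no_conflict and with_conflict pairs above check. A condensed sketch of the conflict-free case, with invented names (%c4, %cst, and %out assumed in scope):

```mlir
// Tensor level: each thread owns one row of the shared output.
%r = scf.forall (%tid) in (%c4) shared_outs(%o = %out) -> (tensor<4x8xf32>) {
  %row = tensor.extract_slice %o[%tid, 0] [1, 8] [1, 1]
    : tensor<4x8xf32> to tensor<8xf32>
  %f = linalg.fill ins(%cst : f32) outs(%row : tensor<8xf32>) -> tensor<8xf32>
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %f into %o[%tid, 0] [1, 8] [1, 1]
      : tensor<8xf32> into tensor<4x8xf32>
  }
}
// After bufferization this becomes a memref.subview filled in place; the
// now-empty scf.forall.in_parallel terminator is elided when printing.
```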
44 changes: 22 additions & 22 deletions mlir/test/Dialect/SCF/ops.mlir
@@ -311,78 +311,78 @@ func.func @execute_region() -> i64 {
return %res : i64
}

-// CHECK-LABEL: func.func @normalized_foreach_thread
-func.func @normalized_foreach_thread(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+// CHECK-LABEL: func.func @normalized_forall
+func.func @normalized_forall(%in: tensor<100xf32>, %out: tensor<100xf32>) {
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-// CHECK: scf.foreach_thread
+// CHECK: scf.forall
// CHECK-NEXT: tensor.extract_slice
-// CHECK-NEXT: scf.foreach_thread.perform_concurrently
+// CHECK-NEXT: scf.forall.in_parallel
// CHECK-NEXT: tensor.parallel_insert_slice
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
-%result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+%result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
}
}
return
}

-// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread
-func.func @explicit_loop_bounds_foreach_thread(%in: tensor<100xf32>,
+// CHECK-LABEL: func.func @explicit_loop_bounds_forall
+func.func @explicit_loop_bounds_forall(%in: tensor<100xf32>,
%out: tensor<100xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-// CHECK: scf.foreach_thread
+// CHECK: scf.forall
// CHECK-NEXT: tensor.extract_slice
-// CHECK-NEXT: scf.foreach_thread.perform_concurrently
+// CHECK-NEXT: scf.forall.in_parallel
// CHECK-NEXT: tensor.parallel_insert_slice
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return
-%result = scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
+%result = scf.forall (%thread_idx) = (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
tensor<1xf32> into tensor<100xf32>
}
}
return
}

-// CHECK-LABEL: func.func @normalized_foreach_thread_elide_terminator
-func.func @normalized_foreach_thread_elide_terminator() -> () {
+// CHECK-LABEL: func.func @normalized_forall_elide_terminator
+func.func @normalized_forall_elide_terminator() -> () {
%num_threads = arith.constant 100 : index

-// CHECK: scf.foreach_thread
+// CHECK: scf.forall
// CHECK-NEXT: } {mapping = [#gpu.thread<x>]}
// CHECK-NEXT: return
-scf.foreach_thread (%thread_idx) in (%num_threads) {
-scf.foreach_thread.perform_concurrently {
+scf.forall (%thread_idx) in (%num_threads) {
+scf.forall.in_parallel {
}
} {mapping = [#gpu.thread<x>]}
return

}

-// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread_elide_terminator
-func.func @explicit_loop_bounds_foreach_thread_elide_terminator() -> () {
+// CHECK-LABEL: func.func @explicit_loop_bounds_forall_elide_terminator
+func.func @explicit_loop_bounds_forall_elide_terminator() -> () {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-// CHECK: scf.foreach_thread
+// CHECK: scf.forall
// CHECK-NEXT: } {mapping = [#gpu.thread<x>]}
// CHECK-NEXT: return
-scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
-scf.foreach_thread.perform_concurrently {
+scf.forall (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
+scf.forall.in_parallel {
}
} {mapping = [#gpu.thread<x>]}
return
16 changes: 8 additions & 8 deletions mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1531,12 +1531,12 @@ func.func @canonicalize_parallel_insert_slice_indices(
%c1 = arith.constant 1 : index

// CHECK-NOT: tensor.cast
-// CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<?x?xf32>) {
-// CHECK-NEXT: scf.foreach_thread.perform_concurrently {
+// CHECK: scf.forall (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<?x?xf32>) {
+// CHECK-NEXT: scf.forall.in_parallel {
// CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][%[[tidx]], 0] [1, 5] [1, 1]
-%2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
+%2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
%3 = tensor.cast %arg0 : tensor<1x5xf32> to tensor<?x5xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %3 into %o[%tidx, %c0] [%c1, 5] [%c1, %c1] : tensor<?x5xf32> into tensor<?x?xf32>
}
}
@@ -1553,11 +1553,11 @@ func.func @dont_fold_parallel_insert_slice(
{
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
-// CHECK: scf.foreach_thread () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
-// CHECK-NEXT: scf.foreach_thread.perform_concurrently {
+// CHECK: scf.forall () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
+// CHECK-NEXT: scf.forall.in_parallel {
// CHECK-NEXT: tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32>
-%2 = scf.foreach_thread () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
-scf.foreach_thread.perform_concurrently {
+%2 = scf.forall () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32>
}
}
@@ -28,11 +28,11 @@ func.func @extract_slice_static(%input: tensor<3x5x7x11xf32>) -> tensor<20x11xf3
// FOREACH-DAG: %[[c5:.+]] = arith.constant 5 : index
// FOREACH-DAG: %[[c7:.+]] = arith.constant 7 : index
// FOREACH-DAG: %[[init:.+]] = tensor.empty() : tensor<20x11xf32>
-// FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile:.+]] = scf.forall (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH: %[[multiIndex:.+]]:3 = affine.delinearize_index %[[iv]] into (%[[c3]], %[[c5]], %[[c7]]
// FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex]]#0, %[[multiIndex]]#1, %[[multiIndex]]#2, 0] [1, 1, 1, 11] [1, 1, 1, 1] :
// FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3]{{\]}} :
-// FOREACH: perform_concurrently
+// FOREACH: in_parallel
// FOREACH-NEXT: tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[iv]], 0] [1, 11] [1, 1] :
// FOREACH: return %[[tile]]

@@ -136,14 +136,14 @@ func.func @extract_slice_dynamic_multidim(%input: tensor<3x?x?x11x?xf32>, %offt0
// FOREACH-DAG: %[[d1:.+]] = tensor.dim %[[arg0]], %[[c1]] :
// FOREACH-DAG: %[[d2:.+]] = tensor.dim %[[arg0]], %[[c2]] :
// FOREACH-DAG: %[[d4:.+]] = tensor.dim %[[arg0]], %[[c4]] :
-// FOREACH: %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
+// FOREACH: %[[tile1:.+]] = scf.forall (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
// FOREACH-DAG: %[[iv1:.+]] = affine.apply #[[map1]](%[[tid1]])[%[[lb1]]]
// FOREACH: %[[multiIndex1:.+]]:3 = affine.delinearize_index %[[iv1]] into (%[[c3]], %[[d1]], %[[d2]]) :
// FOREACH-DAG: %[[iv2:.+]] = affine.apply #[[map1]](%[[tid2]])[%[[lb2]]]
// FOREACH: %[[multiIndex2:.+]]:2 = affine.delinearize_index %[[iv2]] into (%[[c11]], %[[d4]]) :
// FOREACH: %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex1]]#0, %[[multiIndex1]]#1, %[[multiIndex1]]#2, %[[multiIndex2]]#0, %[[multiIndex2]]#1] [1, 1, 1, 1, 1] [1, 1, 1, 1, 1] :
// FOREACH: %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3, 4]{{\]}} :
-// FOREACH: perform_concurrently
+// FOREACH: in_parallel
//FOREACH-NEXT: tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[tid1]], %[[tid2]]] [1, 1] [1, 1] :

// -----
@@ -90,9 +90,9 @@ func.func @insert_slice_rank_reducing_dynamic_shape(
func.func @parallel_insert_slice(%t0: tensor<1x2xf32>, %t1: tensor<f32>, %t2: tensor<1x1xf32>) -> tensor<1x2xf32> {
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
-%r = scf.foreach_thread (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
+%r = scf.forall (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
%inserted_slice = tensor.insert_slice %t1 into %t2[0, 0] [1, 1] [1, 1] : tensor<f32> into tensor<1x1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %inserted_slice into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x2xf32>
}
}
4 changes: 2 additions & 2 deletions mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -44,8 +44,8 @@ func.func @rank_reducing_parallel_insert_of_collapse_shape(
-> tensor<?x?x?x?xf32> {
%0 = tensor.collapse_shape %t [[0, 1], [2], [3]]
: tensor<?x1x1x5xf32> into tensor<?x1x5xf32>
-%1 = scf.foreach_thread (%iv) in (%thr) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
-scf.foreach_thread.perform_concurrently {
+%1 = scf.forall (%iv) in (%thr) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+scf.forall.in_parallel {
tensor.parallel_insert_slice %0 into %o[0, 0, 0, 0][%sz, 1, 1, 5][1, 1, 1, 1]
: tensor<?x1x5xf32> into tensor<?x?x?x?xf32>
}
6 changes: 3 additions & 3 deletions mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -196,10 +196,10 @@ func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tenso
%c1 = arith.constant 1 : index
%num_threads = arith.constant 100 : index

-// CHECK: scf.foreach_thread {{.*}} {
-%result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
+// CHECK: scf.forall {{.*}} {
+%result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
%1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-scf.foreach_thread.perform_concurrently {
+scf.forall.in_parallel {
// CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
// CHECK: memref.subview %{{.*}}[1, %{{.*}}] [1, 1] [1, 1] : memref<200x100xf32, strided<[?, ?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
tensor.parallel_insert_slice %1 into %o[1, %thread_idx][1, 1][1, 1] :
8 changes: 4 additions & 4 deletions mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -82,7 +82,7 @@ struct TestTensorTransforms
Option<bool> useForeach{
*this, "use-foreach",
llvm::cl::desc(
-"Use the scf.foreach_thread operation when generating loop nests for "
+"Use the scf.forall operation when generating loop nests for "
"the extract_slice of collapse_shape pattern"),
llvm::cl::init(false)};

@@ -247,7 +247,7 @@ struct RewriteExtractSliceFromCollapseShapeUsingScfForeach
tensor::ExtractSliceFromCollapseHelper &helper,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();
-auto foreachThreadOp = rewriter.create<scf::ForeachThreadOp>(
+auto forallOp = rewriter.create<scf::ForallOp>(
loc, /*numThreads=*/getAsOpFoldResult(helper.getIterationSpaceSizes()),
/*outputs=*/dest,
/*mapping=*/std::nullopt,
@@ -263,12 +263,12 @@ struct RewriteExtractSliceFromCollapseShapeUsingScfForeach
auto [tile, insertParams] =
helper.emitLoopNestBody(nestedBuilder, loc, outputIvs);
// Insert the slice into the destination.
-auto term = nestedBuilder.create<scf::PerformConcurrentlyOp>(loc);
+auto term = nestedBuilder.create<scf::InParallelOp>(loc);
nestedBuilder.setInsertionPointToStart(term.getBody());
nestedBuilder.create<tensor::ParallelInsertSliceOp>(
loc, tile, outputArgs[0], insertParams);
});
-rewriter.replaceOp(op, foreachThreadOp->getResult(0));
+rewriter.replaceOp(op, forallOp->getResult(0));
return success();
}
};
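The C++ side is a mechanical rename: scf::ForeachThreadOp and scf::PerformConcurrentlyOp become scf::ForallOp and scf::InParallelOp, with builder arguments and result handling untouched. With the use-foreach option described above, the pattern emits IR of roughly this shape, reconstructed here from the FOREACH check lines earlier in this diff (a sketch with assumed names such as %src and %init, not verbatim compiler output):

```mlir
%tile = scf.forall (%iv) in (20) shared_outs(%dest = %init) -> (tensor<20x11xf32>) {
  // Recover the multi-index into the collapsed source dims for this thread.
  %idx:3 = affine.delinearize_index %iv into (%c3, %c5, %c7) : index, index, index
  %slice = tensor.extract_slice %src[%idx#0, %idx#1, %idx#2, 0]
    [1, 1, 1, 11] [1, 1, 1, 1] : tensor<3x5x7x11xf32> to tensor<1x1x1x11xf32>
  %flat = tensor.collapse_shape %slice [[0, 1, 2], [3]]
    : tensor<1x1x1x11xf32> into tensor<1x11xf32>
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %flat into %dest[%iv, 0] [1, 11] [1, 1]
      : tensor<1x11xf32> into tensor<20x11xf32>
  }
}
```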
4 changes: 2 additions & 2 deletions mlir/test/python/dialects/transform_structured_ext.py
@@ -188,15 +188,15 @@ def testTileExplicitLoopTypeAll():
[], transform.AnyOpType.get())
types = [
transform.OperationType.get(x)
-for x in ["scf.for", "scf.parallel", "scf.foreach_thread"]
+for x in ["scf.for", "scf.parallel", "scf.forall"]
]
with InsertionPoint(sequence.body):
structured.TileOp(types, sequence.bodyTarget, sizes=[2, 3, 4])
transform.YieldOp()
# CHECK-LABEL: TEST: testTileExplicitLoopTypeAll
# CHECK: = transform.structured.tile
# CHECK-SAME : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
-# CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.foreach_thread">
+# CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.forall">

@run
def testVectorize():
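The Python change is string-level only: the loop-type handles requested from transform.structured.tile now name scf.forall as the third loop kind. The transform IR the test builds corresponds roughly to the following (hand-written illustration of the checked output, not captured from the test run):

```mlir
transform.sequence failures(propagate) {
^bb0(%arg0: !transform.any_op):
  // One typed handle per generated loop: scf.for, scf.parallel, scf.forall.
  %tiled, %l0, %l1, %l2 = transform.structured.tile %arg0 [2, 3, 4]
    : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
        !transform.op<"scf.parallel">, !transform.op<"scf.forall">)
  transform.yield
}
```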