54 changes: 54 additions & 0 deletions llvm/test/CodeGen/PowerPC/pr74951.ll
@@ -0,0 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s

%struct.anon = type { i32 }

@b = local_unnamed_addr global %struct.anon { i32 -1 }, align 4
@g = local_unnamed_addr global [1 x i1] zeroinitializer, align 1

define noundef signext i32 @main() {
; CHECK-LABEL: main:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld r3, L..C0(r2) # @b
; CHECK-NEXT: lwz r3, 0(r3)
; CHECK-NEXT: extsw r4, r3
; CHECK-NEXT: neg r4, r4
; CHECK-NEXT: andi. r5, r3, 65535
; CHECK-NEXT: rldicl r4, r4, 1, 63
; CHECK-NEXT: bne cr0, L..BB0_4
; CHECK-NEXT: # %bb.1: # %lor.rhs.i.i
; CHECK-NEXT: xori r5, r4, 1
; CHECK-NEXT: cmpw r3, r5
; CHECK-NEXT: crnot 4*cr5+lt, eq
; CHECK-NEXT: li r3, 1
; CHECK-NEXT: bc 12, 4*cr5+lt, L..BB0_3
; CHECK-NEXT: # %bb.2: # %lor.rhs.i.i
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: L..BB0_3: # %lor.rhs.i.i
; CHECK-NEXT: ld r5, L..C1(r2) # @g
; CHECK-NEXT: stb r3, 0(r5)
; CHECK-NEXT: L..BB0_4: # %g.exit
; CHECK-NEXT: ld r5, L..C1(r2) # @g
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: stb r4, 0(r5)
; CHECK-NEXT: blr
entry:
%0 = load i32, ptr @b, align 4
%conv4.i = sext i32 %0 to i64
%cmp.i = icmp slt i32 %0, 1
%conv.i = zext i1 %cmp.i to i32
%cmp1.i = icmp ne i32 %0, %conv.i
%conv3.i = trunc i32 %0 to i16
%tobool.not.i.i = icmp eq i16 %conv3.i, 0
br i1 %tobool.not.i.i, label %lor.rhs.i.i, label %g.exit

lor.rhs.i.i: ; preds = %entry
store i1 %cmp1.i, ptr @g, align 1
br label %g.exit

g.exit: ; preds = %entry, %lor.rhs.i.i
%1 = trunc i64 %conv4.i to i32
%cmp.i9.i = icmp sgt i32 %1, 0
store i1 %cmp.i9.i, ptr @g, align 1
ret i32 0
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -3567,8 +3567,8 @@ define i64 @rmw64_umax_seq_cst(ptr %p) nounwind {
; RV32-NEXT: # in Loop: Header=BB51_2 Depth=1
; RV32-NEXT: neg a3, a0
; RV32-NEXT: and a3, a3, a1
; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a4, 0(sp)
; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: mv a1, sp
; RV32-NEXT: li a4, 5
; RV32-NEXT: li a5, 5
60 changes: 32 additions & 28 deletions llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1324,8 +1324,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: # %bb.4: # %entry
; RV32IF-NEXT: li a0, 1
; RV32IF-NEXT: .LBB20_5: # %entry
; RV32IF-NEXT: lw a3, 8(sp)
; RV32IF-NEXT: lw a4, 12(sp)
; RV32IF-NEXT: lw a4, 8(sp)
; RV32IF-NEXT: lw a3, 12(sp)
; RV32IF-NEXT: and a5, a2, a1
; RV32IF-NEXT: beqz a5, .LBB20_7
; RV32IF-NEXT: # %bb.6: # %entry
@@ -1334,17 +1334,18 @@ define i64 @ustest_f64i64(double %x) {
; RV32IF-NEXT: .LBB20_7:
; RV32IF-NEXT: snez a1, a0
; RV32IF-NEXT: .LBB20_8: # %entry
; RV32IF-NEXT: and a4, a2, a4
; RV32IF-NEXT: and a3, a2, a3
; RV32IF-NEXT: or a0, a0, a5
; RV32IF-NEXT: and a2, a2, a3
; RV32IF-NEXT: and a2, a2, a4
; RV32IF-NEXT: bnez a0, .LBB20_10
; RV32IF-NEXT: # %bb.9:
; RV32IF-NEXT: or a0, a2, a4
; RV32IF-NEXT: snez a1, a0
; RV32IF-NEXT: snez a0, a3
; RV32IF-NEXT: snez a1, a2
; RV32IF-NEXT: or a1, a1, a0
; RV32IF-NEXT: .LBB20_10: # %entry
; RV32IF-NEXT: neg a1, a1
; RV32IF-NEXT: and a0, a1, a2
; RV32IF-NEXT: and a1, a1, a4
; RV32IF-NEXT: and a1, a1, a3
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: addi sp, sp, 32
; RV32IF-NEXT: ret
@@ -1403,8 +1404,8 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: # %bb.4: # %entry
; RV32IFD-NEXT: li a0, 1
; RV32IFD-NEXT: .LBB20_5: # %entry
; RV32IFD-NEXT: lw a3, 8(sp)
; RV32IFD-NEXT: lw a4, 12(sp)
; RV32IFD-NEXT: lw a4, 8(sp)
; RV32IFD-NEXT: lw a3, 12(sp)
; RV32IFD-NEXT: and a5, a2, a1
; RV32IFD-NEXT: beqz a5, .LBB20_7
; RV32IFD-NEXT: # %bb.6: # %entry
Expand All @@ -1413,17 +1414,18 @@ define i64 @ustest_f64i64(double %x) {
; RV32IFD-NEXT: .LBB20_7:
; RV32IFD-NEXT: snez a1, a0
; RV32IFD-NEXT: .LBB20_8: # %entry
; RV32IFD-NEXT: and a4, a2, a4
; RV32IFD-NEXT: and a3, a2, a3
; RV32IFD-NEXT: or a0, a0, a5
; RV32IFD-NEXT: and a2, a2, a3
; RV32IFD-NEXT: and a2, a2, a4
; RV32IFD-NEXT: bnez a0, .LBB20_10
; RV32IFD-NEXT: # %bb.9:
; RV32IFD-NEXT: or a0, a2, a4
; RV32IFD-NEXT: snez a1, a0
; RV32IFD-NEXT: snez a0, a3
; RV32IFD-NEXT: snez a1, a2
; RV32IFD-NEXT: or a1, a1, a0
; RV32IFD-NEXT: .LBB20_10: # %entry
; RV32IFD-NEXT: neg a1, a1
; RV32IFD-NEXT: and a0, a1, a2
; RV32IFD-NEXT: and a1, a1, a4
; RV32IFD-NEXT: and a1, a1, a3
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: addi sp, sp, 32
; RV32IFD-NEXT: ret
@@ -1594,8 +1596,8 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB23_5: # %entry
; RV32-NEXT: lw a3, 8(sp)
; RV32-NEXT: lw a4, 12(sp)
; RV32-NEXT: lw a4, 8(sp)
; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB23_7
; RV32-NEXT: # %bb.6: # %entry
Expand All @@ -1604,17 +1606,18 @@ define i64 @ustest_f32i64(float %x) {
; RV32-NEXT: .LBB23_7:
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB23_8: # %entry
; RV32-NEXT: and a4, a2, a4
; RV32-NEXT: and a3, a2, a3
; RV32-NEXT: or a0, a0, a5
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: bnez a0, .LBB23_10
; RV32-NEXT: # %bb.9:
; RV32-NEXT: or a0, a2, a4
; RV32-NEXT: snez a1, a0
; RV32-NEXT: snez a0, a3
; RV32-NEXT: snez a1, a2
; RV32-NEXT: or a1, a1, a0
; RV32-NEXT: .LBB23_10: # %entry
; RV32-NEXT: neg a1, a1
; RV32-NEXT: and a0, a1, a2
; RV32-NEXT: and a1, a1, a4
; RV32-NEXT: and a1, a1, a3
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
@@ -1847,8 +1850,8 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: # %bb.4: # %entry
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB26_5: # %entry
; RV32-NEXT: lw a3, 8(sp)
; RV32-NEXT: lw a4, 12(sp)
; RV32-NEXT: lw a4, 8(sp)
; RV32-NEXT: lw a3, 12(sp)
; RV32-NEXT: and a5, a2, a1
; RV32-NEXT: beqz a5, .LBB26_7
; RV32-NEXT: # %bb.6: # %entry
Expand All @@ -1857,17 +1860,18 @@ define i64 @ustest_f16i64(half %x) {
; RV32-NEXT: .LBB26_7:
; RV32-NEXT: snez a1, a0
; RV32-NEXT: .LBB26_8: # %entry
; RV32-NEXT: and a4, a2, a4
; RV32-NEXT: and a3, a2, a3
; RV32-NEXT: or a0, a0, a5
; RV32-NEXT: and a2, a2, a3
; RV32-NEXT: and a2, a2, a4
; RV32-NEXT: bnez a0, .LBB26_10
; RV32-NEXT: # %bb.9:
; RV32-NEXT: or a0, a2, a4
; RV32-NEXT: snez a1, a0
; RV32-NEXT: snez a0, a3
; RV32-NEXT: snez a1, a2
; RV32-NEXT: or a1, a1, a0
; RV32-NEXT: .LBB26_10: # %entry
; RV32-NEXT: neg a1, a1
; RV32-NEXT: and a0, a1, a2
; RV32-NEXT: and a1, a1, a4
; RV32-NEXT: and a1, a1, a3
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
6 changes: 3 additions & 3 deletions llvm/test/DebugInfo/dwarfdump-debug-frame-simple.test
@@ -12,15 +12,15 @@
; FRAMES-NEXT: DW_CFA_nop:

; FRAMES: 00000014 00000010 00000000 FDE cie=00000000 pc=00000000...00000022
; FRAMES: DW_CFA_advance_loc: 3
; FRAMES: DW_CFA_advance_loc: 3 to 0x3
; FRAMES-NEXT: DW_CFA_def_cfa_offset: +12
; FRAMES-NEXT: DW_CFA_nop:

; FRAMES: 00000028 00000014 00000000 FDE cie=00000000 pc=00000030...00000080
; FRAMES: DW_CFA_advance_loc: 1
; FRAMES: DW_CFA_advance_loc: 1 to 0x31
; FRAMES-NEXT: DW_CFA_def_cfa_offset: +8
; FRAMES-NEXT: DW_CFA_offset: {{reg5|EBP}} -8
; FRAMES-NEXT: DW_CFA_advance_loc: 2
; FRAMES-NEXT: DW_CFA_advance_loc: 2 to 0x33
; FRAMES-NEXT: DW_CFA_def_cfa_register: {{reg5|EBP}}

; FRAMES-NOT: CIE
4 changes: 2 additions & 2 deletions llvm/test/Instrumentation/AddressSanitizer/aarch64be.ll
@@ -2,9 +2,9 @@
; RUN: opt < %s -passes=asan -S -mtriple=aarch64_be-linux-gnu | FileCheck --check-prefix=CHECK-AARCH64BE %s
; REQUIRES: aarch64-registered-target

define i32 @read_4_bytes(i32* %a) sanitize_address {
define i32 @read_4_bytes(ptr %a) sanitize_address {
entry:
%tmp1 = load i32, i32* %a, align 4
%tmp1 = load i32, ptr %a, align 4
ret i32 %tmp1
}

@@ -16,7 +16,7 @@ target datalayout = "P1"

define i1 @b(i64 %c) addrspace(1) {
%cast = inttoptr i64 %c to ptr addrspace(42)
%cmp = icmp ugt ptr addrspace(42) %cast, getelementptr inbounds ([1 x i32], ptr addrspace(42) @a, i64 0, i64 0)
%cmp = icmp ugt ptr addrspace(42) %cast, @a
ret i1 %cmp
}

@@ -7,17 +7,17 @@

target triple = "x86_64-unknown-linux-gnu"

declare void @llvm.instrprof.increment.step(i8*, i64, i32, i32, i64)
declare void @llvm.instrprof.increment.step(ptr, i64, i32, i32, i64)

declare void @llvm.instrprof.value.profile(i8*, i64, i64, i32, i32)
declare void @llvm.instrprof.value.profile(ptr, i64, i64, i32, i32)

; CHECK: @__profd_foo = private global
@__profn_foo = private constant [3 x i8] c"foo"

define i32 @foo(i32 ()* ) {
%2 = ptrtoint i32 ()* %0 to i64
call void @llvm.instrprof.value.profile(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 0, i64 %2, i32 0, i32 0)
call void @llvm.instrprof.increment.step(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 0, i32 1, i32 0, i64 0)
define i32 @foo(ptr ) {
%2 = ptrtoint ptr %0 to i64
call void @llvm.instrprof.value.profile(ptr @__profn_foo, i64 0, i64 %2, i32 0, i32 0)
call void @llvm.instrprof.increment.step(ptr @__profn_foo, i64 0, i32 1, i32 0, i64 0)
%3 = tail call i32 %0()
ret i32 %3
}
@@ -6,11 +6,11 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK: @__profc_foo = private global [9 x i8] c"\FF\FF\FF\FF\FF\FF\FF\FF\FF", section "__llvm_prf_cnts", comdat, align 8

define void @_Z3foov() {
call void @llvm.instrprof.timestamp(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 12345678, i32 9, i32 0)
call void @llvm.instrprof.timestamp(ptr @__profn_foo, i64 12345678, i32 9, i32 0)
; CHECK: call void @__llvm_profile_set_timestamp(ptr @__profc_foo)
call void @llvm.instrprof.cover(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 12345678, i32 9, i32 8)
call void @llvm.instrprof.cover(ptr @__profn_foo, i64 12345678, i32 9, i32 8)
ret void
}

declare void @llvm.instrprof.timestamp(i8*, i64, i32, i32)
declare void @llvm.instrprof.cover(i8*, i64, i32, i32)
declare void @llvm.instrprof.timestamp(ptr, i64, i32, i32)
declare void @llvm.instrprof.cover(ptr, i64, i32, i32)
8 changes: 4 additions & 4 deletions llvm/test/Instrumentation/InstrProfiling/timestamp.ll
@@ -6,11 +6,11 @@ target triple = "aarch64-unknown-linux-gnu"
; CHECK: @__profc_foo = private global [2 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat, align 8

define void @_Z3foov() {
call void @llvm.instrprof.timestamp(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 12345678, i32 2, i32 0)
call void @llvm.instrprof.timestamp(ptr @__profn_foo, i64 12345678, i32 2, i32 0)
; CHECK: call void @__llvm_profile_set_timestamp(ptr @__profc_foo)
call void @llvm.instrprof.increment(i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64 12345678, i32 2, i32 1)
call void @llvm.instrprof.increment(ptr @__profn_foo, i64 12345678, i32 2, i32 1)
ret void
}

declare void @llvm.instrprof.timestamp(i8*, i64, i32, i32)
declare void @llvm.instrprof.increment(i8*, i64, i32, i32)
declare void @llvm.instrprof.timestamp(ptr, i64, i32, i32)
declare void @llvm.instrprof.increment(ptr, i64, i32, i32)
10 changes: 5 additions & 5 deletions llvm/test/Object/Inputs/small.ll
@@ -4,15 +4,15 @@ target triple = "i386-pc-windows"

define i32 @main() nounwind {
entry:
%call = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0)) nounwind
tail call void bitcast (void (...)* @SomeOtherFunction to void ()*)() nounwind
%call = tail call i32 @puts(ptr @.str) nounwind
tail call void @SomeOtherFunction() nounwind
ret i32 0
}

declare i32 @puts(i8* nocapture) nounwind
declare i32 @puts(ptr nocapture) nounwind

declare void @SomeOtherFunction(...)

@var = global i32 0
@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @var to i8*)], section "llvm.metadata"
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* null, i8* null }]
@llvm.used = appending global [1 x ptr] [ptr @var], section "llvm.metadata"
@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr null, ptr null }]
10 changes: 5 additions & 5 deletions llvm/test/Object/Inputs/trivial.ll
@@ -5,15 +5,15 @@

define i32 @main() nounwind {
entry:
%call = tail call i32 @puts(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i32 0, i32 0)) nounwind
tail call void bitcast (void (...)* @SomeOtherFunction to void ()*)() nounwind
%call = tail call i32 @puts(ptr @.str) nounwind
tail call void @SomeOtherFunction() nounwind
ret i32 0
}

declare i32 @puts(i8* nocapture) nounwind
declare i32 @puts(ptr nocapture) nounwind

declare void @SomeOtherFunction(...)

@var = global i32 0
@llvm.used = appending global [1 x i8*] [i8* bitcast (i32* @var to i8*)], section "llvm.metadata"
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* null, i8* null }]
@llvm.used = appending global [1 x ptr] [ptr @var], section "llvm.metadata"
@llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr null, ptr null }]
4 changes: 2 additions & 2 deletions llvm/test/Object/X86/irsymtab-bad-alias.ll
@@ -11,5 +11,5 @@ target triple = "x86_64-unknown-linux-gnu"
@g1 = global i32 1
@g2 = global i32 2

@a = alias i32, inttoptr(i32 sub (i32 ptrtoint (i32* @g1 to i32),
i32 ptrtoint (i32* @g2 to i32)) to i32*)
@a = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @g1 to i32),
i32 ptrtoint (ptr @g2 to i32)) to ptr)
10 changes: 5 additions & 5 deletions llvm/test/Object/X86/nm-ir.ll
@@ -29,15 +29,15 @@ module asm ".long undef_asm_sym"
@g3 = common global i32 0
@g4 = private global i32 42

@a1 = alias i32, i32* @g1
@a2 = internal alias i32, i32* @g1
@a1 = alias i32, ptr @g1
@a2 = internal alias i32, ptr @g1

define void ()* @f1() {
define ptr @f1() {
call void @f5()
ret void ()* null
ret ptr null
}

@ifunc_f1 = ifunc void (), void ()* ()* @f1
@ifunc_f1 = ifunc void (), ptr @f1

define internal void @f2() {
ret void
2 changes: 1 addition & 1 deletion llvm/test/Object/dllimport-globalref.ll
@@ -11,4 +11,4 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: U f

declare dllimport void @f()
@fp = constant void ()* @f
@fp = constant ptr @f
2 changes: 1 addition & 1 deletion llvm/test/Object/dllimport.ll
@@ -12,6 +12,6 @@ declare dllimport void @f()

define void @g() {
call void @f()
store i32 42, i32* @v
store i32 42, ptr @v
ret void
}
4 changes: 2 additions & 2 deletions llvm/test/Object/mangle-ir.ll
@@ -7,8 +7,8 @@ target datalayout = "m:o"
; CHECK-NOT: memcpy

define void @f() {
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* null, i64 0, i1 false)
tail call void @llvm.memcpy.p0.p0.i64(ptr null, ptr null, i64 0, i1 false)
ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)
8 changes: 4 additions & 4 deletions llvm/test/Object/objc-swift-mixed-imageinfo-macho.ll
@@ -5,11 +5,11 @@

target triple = "x86_64-apple-macosx10.15.0"

@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata", align 8
@llvm.used = appending global [1 x ptr] [ptr @__swift_reflection_version], section "llvm.metadata", align 8
@__swift_reflection_version = linkonce_odr hidden constant i16 3

define i32 @main(i32 %0, i8** %1) #0 {
%3 = bitcast i8** %1 to i8*
define i32 @main(i32 %0, ptr %1) #0 {
%3 = bitcast ptr %1 to ptr
ret i32 0
}

@@ -25,7 +25,7 @@ attributes #0 = { "frame-pointer"="all" "target-cpu"="penryn" "target-features"=
!1 = !{!"-lswiftSwiftOnoneSupport"}
!2 = !{!"-lswiftCore"}
!3 = !{!"-lobjc"}
!4 = !{[1 x i8*]* @llvm.used, null, null, i1 false, i1 true}
!4 = !{ptr @llvm.used, null, null, i1 false, i1 true}
!5 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 15]}
!6 = !{i32 1, !"Objective-C Version", i32 2}
!7 = !{i32 1, !"Objective-C Image Info Version", i32 0}
34 changes: 17 additions & 17 deletions llvm/test/tools/llvm-readobj/ELF/unwind.test
@@ -96,9 +96,9 @@

# CHECK: Program:
# CHECK-NEXT: DW_CFA_def_cfa_offset: +16
# CHECK-NEXT: DW_CFA_advance_loc: 6
# CHECK-NEXT: DW_CFA_advance_loc: 6 to 0x4004a6
# CHECK-NEXT: DW_CFA_def_cfa_offset: +24
# CHECK-NEXT: DW_CFA_advance_loc: 10
# CHECK-NEXT: DW_CFA_advance_loc: 10 to 0x4004b0
# CHECK-NEXT: DW_CFA_def_cfa_expression: DW_OP_breg7 +8, DW_OP_breg16 +0, DW_OP_lit15, DW_OP_and, DW_OP_lit11, DW_OP_ge, DW_OP_lit3, DW_OP_shl, DW_OP_plus
# CHECK-NEXT: DW_CFA_nop:
# CHECK-NEXT: DW_CFA_nop:
@@ -110,12 +110,12 @@
# CHECK-NEXT: address_range: 0x10 (end : 0x4005c6)

# CHECK: Program:
# CHECK-NEXT: DW_CFA_advance_loc: 1
# CHECK-NEXT: DW_CFA_advance_loc: 1 to 0x4005b7
# CHECK-NEXT: DW_CFA_def_cfa_offset: +16
# CHECK-NEXT: DW_CFA_offset: reg6 -16
# CHECK-NEXT: DW_CFA_advance_loc: 3
# CHECK-NEXT: DW_CFA_advance_loc: 3 to 0x4005ba
# CHECK-NEXT: DW_CFA_def_cfa_register: reg6
# CHECK-NEXT: DW_CFA_advance_loc: 11
# CHECK-NEXT: DW_CFA_advance_loc: 11 to 0x4005c5
# CHECK-NEXT: DW_CFA_def_cfa: reg7 +8
# CHECK-NEXT: DW_CFA_nop:
# CHECK-NEXT: DW_CFA_nop:
@@ -126,15 +126,15 @@
# CHECK-NEXT: address_range: 0xc7f (end : 0x40124f)

# CHECK: Program:
# CHECK-NEXT: DW_CFA_advance_loc: 5
# CHECK-NEXT: DW_CFA_advance_loc: 5 to 0x4005d5
# CHECK-NEXT: DW_CFA_def_cfa: reg10 +0
# CHECK-NEXT: DW_CFA_advance_loc: 9
# CHECK-NEXT: DW_CFA_advance_loc: 9 to 0x4005de
# CHECK-NEXT: DW_CFA_expression: reg6 DW_OP_breg6 +0
# CHECK-NEXT: DW_CFA_advance_loc: 5
# CHECK-NEXT: DW_CFA_advance_loc: 5 to 0x4005e3
# CHECK-NEXT: DW_CFA_def_cfa_expression: DW_OP_breg6 -8, DW_OP_deref
# CHECK-NEXT: DW_CFA_advance_loc2: 3174
# CHECK-NEXT: DW_CFA_advance_loc2: 3174 to 0x401249
# CHECK-NEXT: DW_CFA_def_cfa: reg10 +0
# CHECK-NEXT: DW_CFA_advance_loc: 5
# CHECK-NEXT: DW_CFA_advance_loc: 5 to 0x40124e
# CHECK-NEXT: DW_CFA_def_cfa: reg7 +8
# CHECK-NEXT: DW_CFA_nop:
# CHECK-NEXT: DW_CFA_nop:
@@ -146,21 +146,21 @@
# CHECK-NEXT: address_range: 0x66 (end : 0x4012b6)

# CHECK: Program:
# CHECK-NEXT: DW_CFA_advance_loc: 1
# CHECK-NEXT: DW_CFA_advance_loc: 1 to 0x401251
# CHECK-NEXT: DW_CFA_def_cfa_offset: +16
# CHECK-NEXT: DW_CFA_offset: reg6 -16
# CHECK-NEXT: DW_CFA_advance_loc: 3
# CHECK-NEXT: DW_CFA_advance_loc: 3 to 0x401254
# CHECK-NEXT: DW_CFA_def_cfa_register: reg6
# CHECK-NEXT: DW_CFA_advance_loc: 2
# CHECK-NEXT: DW_CFA_advance_loc: 2 to 0x401256
# CHECK-NEXT: DW_CFA_offset: reg15 -24
# CHECK-NEXT: DW_CFA_advance_loc: 5
# CHECK-NEXT: DW_CFA_advance_loc: 5 to 0x40125b
# CHECK-NEXT: DW_CFA_offset: reg14 -32
# CHECK-NEXT: DW_CFA_advance_loc: 7
# CHECK-NEXT: DW_CFA_advance_loc: 7 to 0x401262
# CHECK-NEXT: DW_CFA_offset: reg13 -40
# CHECK-NEXT: DW_CFA_offset: reg12 -48
# CHECK-NEXT: DW_CFA_advance_loc: 8
# CHECK-NEXT: DW_CFA_advance_loc: 8 to 0x40126a
# CHECK-NEXT: DW_CFA_offset: reg3 -56
# CHECK-NEXT: DW_CFA_advance_loc1: 75
# CHECK-NEXT: DW_CFA_advance_loc1: 75 to 0x4012b5
# CHECK-NEXT: DW_CFA_def_cfa: reg7 +8
# CHECK-NEXT: DW_CFA_nop:
# CHECK-NEXT: DW_CFA_nop:
7 changes: 5 additions & 2 deletions llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -196,6 +196,7 @@ void PrinterContext<ELFT>::printEHFrame(const Elf_Shdr *EHFrameShdr) const {
reportError(std::move(E), ObjF.getFileName());

for (const dwarf::FrameEntry &Entry : EHFrame) {
std::optional<uint64_t> InitialLocation;
if (const dwarf::CIE *CIE = dyn_cast<dwarf::CIE>(&Entry)) {
W.startLine() << format("[0x%" PRIx64 "] CIE length=%" PRIu64 "\n",
Address + CIE->getOffset(), CIE->getLength());
@@ -214,8 +215,9 @@
Address + FDE->getLinkedCIE()->getOffset());
W.indent();

InitialLocation = FDE->getInitialLocation();
W.startLine() << format("initial_location: 0x%" PRIx64 "\n",
FDE->getInitialLocation());
*InitialLocation);
W.startLine() << format(
"address_range: 0x%" PRIx64 " (end : 0x%" PRIx64 ")\n",
FDE->getAddressRange(),
@@ -227,7 +229,8 @@
W.indent();
auto DumpOpts = DIDumpOptions();
DumpOpts.IsEH = true;
Entry.cfis().dump(W.getOStream(), DumpOpts, W.getIndentLevel());
Entry.cfis().dump(W.getOStream(), DumpOpts, W.getIndentLevel(),
InitialLocation);
W.unindent();
W.unindent();
W.getOStream() << "\n";
26 changes: 26 additions & 0 deletions mlir/include/mlir/Dialect/Linalg/Transforms/AllInterfaces.h
@@ -0,0 +1,26 @@
//===- AllInterfaces.h - ----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a common entry point for registering all external
// interface implementations to the linalg dialect.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H
#define MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H

namespace mlir {
class DialectRegistry;

namespace linalg {
void registerAllDialectInterfaceImplementations(DialectRegistry &registry);
} // namespace linalg

} // namespace mlir

#endif // MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H
20 changes: 20 additions & 0 deletions mlir/include/mlir/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.h
@@ -0,0 +1,20 @@
//===- MeshShardingInterfaceImpl.h ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H
#define MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H

namespace mlir {
class DialectRegistry;

namespace linalg {
void registerMeshShardingInterfaceExternalModels(DialectRegistry &registry);
} // namespace linalg
} // namespace mlir

#endif // MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
@@ -46,6 +46,12 @@ def Mesh_ReductionKind : I32EnumAttr<"ReductionKind",
I32EnumAttrCase<"Sum", 1, "sum">,
I32EnumAttrCase<"Max", 2, "max">,
I32EnumAttrCase<"Min", 3, "min">,
I32EnumAttrCase<"Product", 4, "product">,
// Arithmetic mean.
I32EnumAttrCase<"Average", 5, "average">,
I32EnumAttrCase<"BitwiseAnd", 6, "bitwise_and">,
I32EnumAttrCase<"BitwiseOr", 7, "bitwise_or">,
I32EnumAttrCase<"BitwiseXor", 8, "bitwise_xor">,
I32EnumAttrCase<"Generic", 100, "generic">
]> {
let genSpecializedAttr = 0;
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td
@@ -353,6 +353,10 @@ def Mesh_AllReduceOp : Mesh_CollectiveCommunicationOpBase<"all_reduce", [
attr-dict `:` type($input) `->` type($result)
}];
let hasCanonicalizer = 1;
let builders = [
OpBuilder<(ins "Value":$input, "StringRef":$mesh,
"ArrayRef<MeshAxis>":$meshAxes, "ReductionKind":$reduction)>
];
}

def Mesh_AllSliceOp : Mesh_CollectiveCommunicationOpBase<"all_slice", [
18 changes: 18 additions & 0 deletions mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h
@@ -22,6 +22,24 @@ class SymbolTableCollection;

namespace mesh {

// Retrieve the mesh axes corresponding to each operation loop iterator based
// on the provided shardings for the op's operands and results.
// Assumes that the indexingMaps are projected permutations.
ShardingArray getMeshAxisAssignmentForLoopIterators(
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<AffineMap> indexingMaps);

bool isAtLeastOneReductionIteratorSharded(
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators);

// Get the set of mesh axes that correspond to reduction loop iterators.
SmallVector<MeshAxis> getReductionMeshAxes(
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators);
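// For intuition, a small hypothetical example (names and shardings below are
// illustrative, not from this patch): a matmul with indexing maps
// [(d0, d2), (d2, d1), (d0, d1)] and loop iterator types
// [parallel, parallel, reduction], whose first operand has its d2 tensor
// dimension split over mesh axis 0, yields the assignment [[], [], [0]].
// isAtLeastOneReductionIteratorSharded is then true, and
// getReductionMeshAxes returns [0].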

// Inserts a clone of the operation that has all ranked tensor
// arguments/results sharded.
void spmdizeTriviallyShardableOperation(
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/Mesh/Transforms/Transforms.h
@@ -13,6 +13,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/ArrayRef.h"

namespace mlir {
class RewritePatternSet;
@@ -37,6 +38,11 @@ TypedValue<IndexType>
createCollectiveProcessGroupSize(MeshOp mesh, ArrayRef<MeshAxis> axes,
ImplicitLocOpBuilder &builder);

// Get process linear index along the given mesh axes.
TypedValue<IndexType> createProcessLinearIndex(StringRef mesh,
ArrayRef<MeshAxis> meshAxes,
ImplicitLocOpBuilder &builder);

} // namespace mesh
} // namespace mlir

8 changes: 8 additions & 0 deletions mlir/include/mlir/IR/Dialect.h
@@ -216,6 +216,14 @@ class Dialect {
{TypeID::get<ConcreteT>(), InterfaceT::getInterfaceID()});
}

// Declare the same interface for multiple types.
// Example:
// declarePromisedInterfaces<FunctionOpInterface, MyFuncType1, MyFuncType2>()
template <typename InterfaceT, typename... ConcreteT>
void declarePromisedInterfaces() {
(declarePromisedInterface<ConcreteT, InterfaceT>(), ...);
}
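// A minimal usage sketch (op and interface names are hypothetical, not part
// of this change); the fold expression above expands to one
// declarePromisedInterface<ConcreteT, InterfaceT>() call per op type:
//
//   void MyDialect::initialize() {
//     declarePromisedInterfaces<mesh::ShardingInterface, MyOpA, MyOpB>();
//     // Equivalent to:
//     //   declarePromisedInterface<MyOpA, mesh::ShardingInterface>();
//     //   declarePromisedInterface<MyOpB, mesh::ShardingInterface>();
//   }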

/// Checks if the given interface, which is attempting to be used, is a
/// promised interface of this dialect that has yet to be implemented. If so,
/// emits a fatal error. `interfaceName` is an optional string that contains a
10 changes: 2 additions & 8 deletions mlir/include/mlir/InitAllDialects.h
@@ -43,10 +43,7 @@
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h"
#include "mlir/Dialect/MLProgram/IR/MLProgram.h"
#include "mlir/Dialect/MLProgram/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/MPI/IR/MPI.h"
@@ -157,10 +154,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
cf::registerBufferizableOpInterfaceExternalModels(registry);
cf::registerBufferDeallocationOpInterfaceExternalModels(registry);
gpu::registerBufferDeallocationOpInterfaceExternalModels(registry);
linalg::registerBufferizableOpInterfaceExternalModels(registry);
linalg::registerSubsetOpInterfaceExternalModels(registry);
linalg::registerTilingInterfaceExternalModels(registry);
linalg::registerValueBoundsOpInterfaceExternalModels(registry);
linalg::registerAllDialectInterfaceImplementations(registry);
memref::registerAllocationOpInterfaceExternalModels(registry);
memref::registerRuntimeVerifiableOpInterfaceExternalModels(registry);
memref::registerValueBoundsOpInterfaceExternalModels(registry);
33 changes: 33 additions & 0 deletions mlir/include/mlir/Transforms/DialectConversion.h
@@ -1085,6 +1085,39 @@ struct ConversionConfig {
/// IR during an analysis conversion and only pre-existing operations are
/// added to the set.
DenseSet<Operation *> *legalizableOps = nullptr;

/// An optional listener that is notified about all IR modifications in case
/// dialect conversion succeeds. If the dialect conversion fails and no IR
/// modifications are visible (i.e., they were all rolled back), no
/// notifications are sent.
///
/// Note: Notifications are sent in a delayed fashion, when the dialect
/// conversion is guaranteed to succeed. At that point, some IR modifications
/// may already have been materialized. Consequently, operations/blocks that
/// are passed to listener callbacks should not be accessed. (Ops/blocks are
/// guaranteed to be valid pointers and accessing op names is allowed. But
/// there are no guarantees about the state of ops/blocks at the time that a
/// callback is triggered.)
///
/// Example: Consider a dialect conversion during which a new op ("test.foo") is created
/// and inserted, and later moved to another block. (Moving ops also triggers
/// "notifyOperationInserted".)
///
/// (1) notifyOperationInserted: "test.foo" (into block "b1")
/// (2) notifyOperationInserted: "test.foo" (moved to another block "b2")
///
/// When querying "op->getBlock()" during the first "notifyOperationInserted",
/// "b2" would be returned because "moving an op" is a kind of rewrite that is
/// immediately performed by the dialect conversion (and rolled back upon
/// failure).
//
// Note: When receiving a "notifyBlockInserted"/"notifyOperationInserted"
// callback, the previous region/block is provided to the callback, but not
// the iterator pointing to the exact location within the region/block. That
// is because these notifications are sent with a delay (after the IR has
// already been modified) and iterators into past IR state cannot be
// represented at the moment.
RewriterBase::Listener *listener = nullptr;
};
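// A minimal usage sketch (not part of this diff; it assumes the
// notifyOperationInserted overload taking an OpBuilder::InsertPoint and the
// conversion-driver overloads that accept a ConversionConfig):
//
//   struct DumpListener : public RewriterBase::Listener {
//     void notifyOperationInserted(Operation *op,
//                                  OpBuilder::InsertPoint previous) override {
//       llvm::errs() << "inserted: " << op->getName() << "\n";
//     }
//   };
//
//   DumpListener listener;
//   ConversionConfig config;
//   config.listener = &listener;
//   // ... then run applyPartialConversion with `config`.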

//===----------------------------------------------------------------------===//
1 change: 1 addition & 0 deletions mlir/lib/Dialect/Linalg/IR/CMakeLists.txt
@@ -25,6 +25,7 @@ add_mlir_dialect_library(MLIRLinalgDialect
MLIRInferTypeOpInterface
MLIRIR
MLIRParser
MLIRShardingInterface
MLIRSideEffectInterfaces
MLIRSparseTensorDialect
MLIRSCFDialect
7 changes: 7 additions & 0 deletions mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
@@ -118,6 +119,12 @@ void mlir::linalg::LinalgDialect::initialize() {
>(namedStructuredOpRegionBuilders);

addInterfaces<LinalgInlinerInterface>();

declarePromisedInterface<GenericOp, mesh::ShardingInterface>();
declarePromisedInterfaces<mesh::ShardingInterface,
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
>();
}

LogicalResult LinalgDialect::verifyOperationAttribute(Operation *op,
24 changes: 24 additions & 0 deletions mlir/lib/Dialect/Linalg/Transforms/AllInterfaces.cpp
@@ -0,0 +1,24 @@
//===- AllInterfaces.cpp - ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h"

#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"

void mlir::linalg::registerAllDialectInterfaceImplementations(
DialectRegistry &registry) {
registerBufferizableOpInterfaceExternalModels(registry);
registerMeshShardingInterfaceExternalModels(registry);
registerSubsetOpInterfaceExternalModels(registry);
registerTilingInterfaceExternalModels(registry);
registerValueBoundsOpInterfaceExternalModels(registry);
}
5 changes: 5 additions & 0 deletions mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
add_mlir_dialect_library(MLIRLinalgTransforms
AllInterfaces.cpp
BubbleUpExtractSlice.cpp
BufferizableOpInterfaceImpl.cpp
Bufferize.cpp
@@ -21,6 +22,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms
InlineScalarOperands.cpp
Interchange.cpp
Loops.cpp
MeshShardingInterfaceImpl.cpp
NamedOpConversions.cpp
Padding.cpp
Promotion.cpp
@@ -61,12 +63,15 @@ add_mlir_dialect_library(MLIRLinalgTransforms
MLIRIR
MLIRMemRefDialect
MLIRMemRefTransforms
MLIRMeshDialect
MLIRMeshTransforms
MLIRLinalgDialect
MLIRLinalgUtils
MLIRSCFDialect
MLIRSCFTransforms
MLIRSCFUtils
MLIRPass
MLIRShardingInterface
MLIRSubsetOpInterface
MLIRSparseTensorDialect
MLIRTensorDialect
353 changes: 353 additions & 0 deletions mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
@@ -0,0 +1,353 @@
//===- MeshShardingInterfaceImpl.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/Mesh/IR/MeshOps.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h"
#include "mlir/Dialect/Mesh/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/TilingInterface.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include <iterator>
#include <optional>
#include <utility>

namespace mlir::linalg {

using MeshAxis = mesh::MeshAxis;
using ReductionKind = mesh::ReductionKind;
using MeshShardingAttr = mesh::MeshShardingAttr;
using ShardingArray = mesh::ShardingArray;
using MeshOp = mesh::MeshOp;

// Returns the corresponding mesh reduction kind for the given arith op.
static ReductionKind getReductionKind(Operation *op) {
return llvm::TypeSwitch<Operation *, ReductionKind>(op)
// Floating-point operations.
.Case([](arith::AddFOp op) { return ReductionKind::Sum; })
.Case([](arith::MulFOp op) { return ReductionKind::Product; })
// TODO: handle maxnumf and minnumf.
.Case([](arith::MaximumFOp op) { return ReductionKind::Max; })
.Case([](arith::MinimumFOp op) { return ReductionKind::Min; })
// Integer operations.
.Case([](arith::AddIOp op) { return ReductionKind::Sum; })
.Case([](arith::OrIOp op) { return ReductionKind::BitwiseOr; })
.Case([](arith::XOrIOp op) { return ReductionKind::BitwiseXor; })
.Case([](arith::AndIOp op) { return ReductionKind::BitwiseAnd; })
// TODO: handle signless, signed and unsigned types properly.
// It is assumed that the element type of the collective operands and
// result drive the meaning of the reduction kind, whether it is signed
// or unsigned.
// The reduction op inside the linalg op may have different result type
// from the element type of the linalg op's result.
// Also signed and unsigned Arith dialect ops may accept signed, unsigned
// or signless operands.
// Maybe expand the reduction kinds.
.Case([](arith::MaxUIOp op) { return ReductionKind::Max; })
.Case([](arith::MinUIOp op) { return ReductionKind::Min; })
.Case([](arith::MaxSIOp op) { return ReductionKind::Max; })
.Case([](arith::MinSIOp op) { return ReductionKind::Min; })
.Case([](arith::MulIOp op) { return ReductionKind::Product; })
.Default([](Operation *op) { return ReductionKind::Generic; });
}

static std::optional<Operation *> getCombinerOp(LinalgOp op) {
SmallVector<Operation *> combinerOps;
Value reducedValue = matchReduction(op.getRegionOutputArgs(), 0, combinerOps);
if (!reducedValue || combinerOps.size() != 1) {
return std::nullopt;
}

return combinerOps[0];
}

static ReductionKind getReductionKindOfLinalgOp(LinalgOp op) {
std::optional<Operation *> reductionOp = getCombinerOp(op);
if (!reductionOp) {
return ReductionKind::Generic;
}
[[maybe_unused]] Type resultElementType =
llvm::cast<RankedTensorType>(op->getResult(0).getType()).getElementType();
// TODO: handle case when result type of the reduction op does not match the
// element type of the result tensor.
// Would it make sense at all?
assert(resultElementType == reductionOp.value()->getResult(0).getType());
return getReductionKind(reductionOp.value());
}

static MeshOp getMesh(Operation *op,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
SymbolTableCollection &symbolTable) {
for (MeshShardingAttr sharding : operandShardings) {
if (sharding) {
return mesh::getMesh(op, sharding.getMesh(), symbolTable);
}
}

for (MeshShardingAttr sharding : resultShardings) {
if (sharding) {
return mesh::getMesh(op, sharding.getMesh(), symbolTable);
}
}

assert(false);
return nullptr;
}

// Choose the operand based on the current process index along the reduction
// mesh axes.
// We need to use the initial value only once to avoid including it in the
// reduction multiple times.
// In each process group only the leading process with linear index 0 would use
// the original operand.
// The other processes would use the reduction operation neutral tensor.
static Value createDestinationPassingStyleInitOperand(
LinalgOp op, Value spmdizedOperand, ArrayRef<MeshAxis> reductionMeshAxes,
MeshOp meshOp, ImplicitLocOpBuilder &builder) {
Value processLinearIndexInReductionGroup = mesh::createProcessLinearIndex(
meshOp.getSymName(), reductionMeshAxes, builder);
Value zero = builder.create<arith::ConstantIndexOp>(0);
Value isLeadProcess = builder.create<arith::CmpIOp>(
builder.getI1Type(), arith::CmpIPredicate::eq,
processLinearIndexInReductionGroup, zero);
scf::IfOp ifOp = builder.create<scf::IfOp>(spmdizedOperand.getType(),
isLeadProcess, true, true);
// Then block.
{
OpBuilder::InsertionGuard insertionGuard(builder);
builder.setInsertionPointToEnd(&ifOp.getThenRegion().front());
builder.create<scf::YieldOp>(spmdizedOperand);
}

// Else block.
{
OpBuilder::InsertionGuard insertionGuard(builder);
builder.setInsertionPointToEnd(&ifOp.getElseRegion().front());
SmallVector<OpFoldResult> shape =
tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand);
PartialReductionOpInterface partialReductionIface =
llvm::cast<PartialReductionOpInterface>(op.getOperation());
FailureOr<Operation *> reductionNeutralTensorOp =
partialReductionIface.generateInitialTensorForPartialReduction(
builder, builder.getLoc(), shape, {});
assert(succeeded(reductionNeutralTensorOp));
builder.create<scf::YieldOp>(
reductionNeutralTensorOp.value()->getResult(0));
}
return ifOp.getResult(0);
}

// Create the DPS init operands for the spmdized Linalg op.
// Return all the new spmdized operands.
static SmallVector<Value> createDestinationPassingStyleInitOperands(
LinalgOp op, MeshOp meshOp, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshAxis> reductionMeshAxes, IRMapping &spmdizationMap,
ImplicitLocOpBuilder &builder) {
// TODO: add support for multiple destination passing style initial value
// operands.
// PartialReductionOpInterface::generateInitialTensorForPartialReduction
// needs to also support multiple DPS initial operands.
SmallVector<Value> newOperands = llvm::to_vector(spmdizedOperands);
auto operandIdx = op.getDpsInitOperand(0)->getOperandNumber();
Value spmdizedInitOperand =
spmdizationMap.lookup(op->getOperands()[operandIdx]);
newOperands[operandIdx] = createDestinationPassingStyleInitOperand(
op, spmdizedInitOperand, reductionMeshAxes, meshOp, builder);
return newOperands;
}

static void createAllReduceForResultWithoutPartialSharding(
Value unshardedLinalgOpResult, ArrayRef<MeshAxis> opReductionMeshAxes,
MeshShardingAttr resultSharding, ReductionKind reductionKind,
IRMapping &spmdizationMap, ImplicitLocOpBuilder &builder) {
SmallVector<MeshAxis> allReduceMeshAxes;
llvm::copy_if(opReductionMeshAxes, std::back_inserter(allReduceMeshAxes),
[&resultSharding](MeshAxis axis) {
return !llvm::is_contained(resultSharding.getPartialAxes(),
axis);
});
if (allReduceMeshAxes.empty()) {
return;
}

Value spmdizedLinalgOpResult = spmdizationMap.lookup(unshardedLinalgOpResult);
Value reducedValue = builder.create<mesh::AllReduceOp>(
spmdizedLinalgOpResult, resultSharding.getMesh().getValue(),
allReduceMeshAxes, reductionKind);
spmdizationMap.map(unshardedLinalgOpResult, reducedValue);
}

static void createAllReduceForResultsWithoutPartialShardings(
LinalgOp unshardedOp, ArrayRef<MeshAxis> opReductionMeshAxes,
ArrayRef<MeshShardingAttr> resultShardings, IRMapping &spmdizationMap,
ImplicitLocOpBuilder &builder) {
ReductionKind reductionKind = getReductionKindOfLinalgOp(unshardedOp);
for (auto [unshardedLinalgOpResult, resultSharding] :
llvm::zip_equal(unshardedOp->getResults(), resultShardings)) {
createAllReduceForResultWithoutPartialSharding(
unshardedLinalgOpResult, opReductionMeshAxes, resultSharding,
reductionKind, spmdizationMap, builder);
}
}

static void spmdizeLinalgOpWithShardedReduction(
LinalgOp op, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators,
IRMapping &spmdizationMap, SymbolTableCollection &symbolTable,
ImplicitLocOpBuilder &builder) {
MeshOp mesh = getMesh(op, operandShardings, resultShardings, symbolTable);
SmallVector<MeshAxis> reductionMeshAxes = mesh::getReductionMeshAxes(
loopIteratorTypes, meshAxisAssignmentForLoopIterators);
SmallVector<Value> spmdizedLinalgOpOperands =
createDestinationPassingStyleInitOperands(op, mesh, spmdizedOperands,
reductionMeshAxes,
spmdizationMap, builder);
// We must not change the operand mappings of the original spmdizationMap as
// they are the mappings for the whole spmdization blob and may be used by
// others.
IRMapping internalSpmdizationMap;
for (auto [unshardedOperand, spmdizedOperand] :
llvm::zip_equal(op->getOperands(), spmdizedLinalgOpOperands)) {
internalSpmdizationMap.map(unshardedOperand, spmdizedOperand);
}
spmdizeTriviallyShardableOperation(
*op, spmdizedLinalgOpOperands, operandShardings, resultShardings,
internalSpmdizationMap, symbolTable, builder);
for (Value result : op->getResults()) {
spmdizationMap.map(result, internalSpmdizationMap.lookup(result));
}

// Handle partial shardings.
createAllReduceForResultsWithoutPartialShardings(
op, reductionMeshAxes, resultShardings, spmdizationMap, builder);
}

namespace {

// ShardingInterface for ops that implement LinalgStructuredInterface.
// The supported ops are only those where the indexing maps are projected
// permutations.
template <typename Op>
struct StructuredOpShardingInterface
: public mesh::ShardingInterface::ExternalModel<
StructuredOpShardingInterface<Op>, Op> {
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
return llvm::cast<LinalgOp>(op).getIteratorTypesArray();
}

SmallVector<AffineMap> getIndexingMaps(Operation *op) const {
LinalgOp linalgOp = llvm::cast<LinalgOp>(op);
SmallVector<AffineMap> res = linalgOp.getIndexingMapsArray();

// Results must have the same indexing as destination passing style initial
// operands.
for (int64_t i = 0; i < linalgOp.getNumDpsInits(); ++i) {
res.push_back(res[linalgOp.getDpsInitOperand(i)->getOperandNumber()]);
}

return res;
}

LogicalResult spmdize(Operation *op, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
IRMapping &spmdizationMap,
SymbolTableCollection &symbolTable,
OpBuilder &builder) const {
LinalgOp linalgOp = llvm::cast<LinalgOp>(op);

SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
bool allIndexingMapsAreProjectedPermutation =
llvm::all_of(indexingMaps, [](AffineMap map) {
return map.isProjectedPermutation();
});
if (!allIndexingMapsAreProjectedPermutation) {
// TODO: handle non-projected permutations.
return op->emitOpError()
<< "supports indexing maps that are only projected permutation.";
}

SmallVector<utils::IteratorType> loopIteratorTypes =
linalgOp.getIteratorTypesArray();
ShardingArray meshAxisAssignmentForLoopIterators =
getMeshAxisAssignmentForLoopIterators(operandShardings, resultShardings,
loopIteratorTypes, indexingMaps);
if (mesh::isAtLeastOneReductionIteratorSharded(
loopIteratorTypes, meshAxisAssignmentForLoopIterators)) {
ImplicitLocOpBuilder implicitLocBuilder(op->getLoc(), builder);
spmdizeLinalgOpWithShardedReduction(
linalgOp, spmdizedOperands, operandShardings, resultShardings,
loopIteratorTypes, meshAxisAssignmentForLoopIterators, spmdizationMap,
symbolTable, implicitLocBuilder);
} else {
spmdizeTriviallyShardableOperation(*op, spmdizedOperands,
operandShardings, resultShardings,
spmdizationMap, symbolTable, builder);
}

return success();
}
};

} // namespace

template <typename OpType>
static void registerOne(MLIRContext *ctx) {
OpType::template attachInterface<StructuredOpShardingInterface<OpType>>(*ctx);
}

/// Variadic helper function.
template <typename... OpTypes>
static void registerAll(MLIRContext *ctx) {
(registerOne<OpTypes>(ctx), ...);
}

void registerMeshShardingInterfaceExternalModels(DialectRegistry &registry) {
registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) {
DialectRegistry registry;
registry.insert<affine::AffineDialect, arith::ArithDialect, scf::SCFDialect,
tensor::TensorDialect>();
ctx->appendDialectRegistry(registry);
for (StringRef name : registry.getDialectNames())
ctx->getOrLoadDialect(name);

registerOne<linalg::GenericOp>(ctx);
registerAll<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
>(ctx);
});
}

} // namespace mlir::linalg
8 changes: 0 additions & 8 deletions mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -275,14 +275,6 @@ struct LinalgOpPartialReductionInterface
ArrayRef<int64_t> oldShape =
linalgOp.getShape(linalgOp.getDpsInitOperand(0));

// Extend tile size vector to the rank of the output tensor.
SmallVector<Value> tileSizeVector =
getValueOrCreateConstantIndexOp(b, loc, sizes);
if (tileSizeVector.size() < oldShape.size()) {
auto zero = b.create<arith::ConstantIndexOp>(loc, 0);
tileSizeVector.append(oldShape.size() - tileSizeVector.size(), zero);
}

// Calculate the new shape, we insert the new dimensions based on the index
// of the reduction dimensions.
SmallVector<int64_t> newOutputShape;
7 changes: 7 additions & 0 deletions mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -647,6 +647,13 @@ void AllReduceOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
patterns.add<EmptyMeshAxesCanonicalizationPattern<AllReduceOp>>(context);
}

void AllReduceOp::build(OpBuilder &odsBuilder, OperationState &odsState,
Value input, StringRef mesh,
ArrayRef<MeshAxis> meshAxes, ReductionKind reduction) {
build(odsBuilder, odsState, input.getType(), mesh, meshAxes, input,
reduction);
}
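// For reference, this is exactly the form used by the linalg spmdization code
// added in this patch (MeshShardingInterfaceImpl.cpp):
//
//   Value reducedValue = builder.create<mesh::AllReduceOp>(
//       spmdizedLinalgOpResult, resultSharding.getMesh().getValue(),
//       allReduceMeshAxes, reductionKind);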

void AllReduceOp::getAsmResultNames(
function_ref<void(Value, StringRef)> setNameFn) {
setNameFn(getResult(), "all_reduce");
89 changes: 86 additions & 3 deletions mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
@@ -539,8 +539,9 @@ static bool areValuesCompatibleWithFullReplicationShardings(
if (std::size(values) != std::size(shardings)) {
return false;
}
return llvm::all_of(llvm::zip(std::forward<ValueRange>(values),
std::forward<MeshShardingAttrRage>(shardings)),
return llvm::all_of(llvm::zip_equal(
std::forward<ValueRange>(values),
std::forward<MeshShardingAttrRage>(shardings)),
[](auto valueAndSharding) {
return isValueCompatibleWithFullReplicationSharding(
std::get<0>(valueAndSharding),
@@ -563,6 +564,88 @@
builder.clone(op, spmdizationMap);
}

static void updateMeshAxisAssignmentForLoopIterators(
ArrayRef<MeshAxis> meshAxesAssignmentForTensorAxis, AffineExpr indexingExpr,
SmallVector<std::optional<SmallVector<MeshAxis>>>
&meshAxesAssignmentForLoopIterators) {
AffineDimExpr affineDimExpr = cast<AffineDimExpr>(indexingExpr);
unsigned loopIteratorIdx = affineDimExpr.getPosition();
if (meshAxesAssignmentForLoopIterators[loopIteratorIdx]) {
assert(llvm::equal(meshAxesAssignmentForTensorAxis,
*meshAxesAssignmentForLoopIterators[loopIteratorIdx]));
} else {
meshAxesAssignmentForLoopIterators[loopIteratorIdx] =
llvm::to_vector(meshAxesAssignmentForTensorAxis);
}
}

ShardingArray mesh::getMeshAxisAssignmentForLoopIterators(
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<AffineMap> indexingMaps) {
SmallVector<std::optional<SmallVector<MeshAxis>>>
meshAxisAssignmentForLoopIterators(loopIteratorTypes.size());
SmallVector<MeshShardingAttr> operatorAndResultShardings;
operatorAndResultShardings.reserve(operandShardings.size() +
resultShardings.size());
llvm::append_range(operatorAndResultShardings, operandShardings);
llvm::append_range(operatorAndResultShardings, resultShardings);
for (auto [sharding, affineMap] :
llvm::zip_equal(operatorAndResultShardings, indexingMaps)) {
if (!sharding) {
continue;
}
for (auto [meshAxesAssignmentForTensorAxis, indexingExpr] :
llvm::zip(sharding.getSplitAxes(), affineMap.getResults())) {
updateMeshAxisAssignmentForLoopIterators(
meshAxesAssignmentForTensorAxis.asArrayRef(), indexingExpr,
meshAxisAssignmentForLoopIterators);
}
// Missing trailing split axes mean that those tensor dimensions are replicated.
for (unsigned i = sharding.getSplitAxes().size();
i < affineMap.getNumResults(); ++i) {
updateMeshAxisAssignmentForLoopIterators(
{}, affineMap.getResults()[i], meshAxisAssignmentForLoopIterators);
}
}

ShardingArray res;
llvm::transform(meshAxisAssignmentForLoopIterators, std::back_inserter(res),
[](std::optional<SmallVector<MeshAxis>> &axes) {
if (!axes) {
return SmallVector<MeshAxis>();
}
return std::move(*axes);
});
return res;
}

bool mesh::isAtLeastOneReductionIteratorSharded(
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators) {
for (auto [loopIteratorType, meshAxisAssignment] :
llvm::zip_equal(loopIteratorTypes, meshAxisAssignmentForLoopIterators)) {
if (loopIteratorType == utils::IteratorType::reduction &&
!meshAxisAssignment.empty()) {
return true;
}
}
return false;
}

SmallVector<MeshAxis> mesh::getReductionMeshAxes(
ArrayRef<utils::IteratorType> loopIteratorTypes,
ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators) {
SmallVector<MeshAxis> meshAxes;
for (auto [loopIteratorType, meshAxisAssignment] :
llvm::zip_equal(loopIteratorTypes, meshAxisAssignmentForLoopIterators)) {
if (loopIteratorType == utils::IteratorType::reduction) {
llvm::append_range(meshAxes, meshAxisAssignment);
}
}
return meshAxes;
}

void mesh::spmdizeTriviallyShardableOperation(
Operation &op, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshShardingAttr> operandShardings,
@@ -572,7 +655,7 @@ void mesh::spmdizeTriviallyShardableOperation(
Operation *newOp = builder.clone(op, spmdizationMap);
// Set the result types to the sharded counterparts.
for (auto [oldResult, newResult, sharding] :
llvm::zip(op.getResults(), newOp->getResults(), resultShardings)) {
llvm::zip_equal(op.getResults(), newOp->getResults(), resultShardings)) {
newResult.setType(shardType(newResult.getType(),
getMesh(&op, sharding.getMesh(), symbolTable),
sharding));
13 changes: 13 additions & 0 deletions mlir/lib/Dialect/Mesh/Transforms/Transforms.cpp
@@ -208,4 +208,17 @@
.cast<TypedValue<IndexType>>();
}

TypedValue<IndexType> createProcessLinearIndex(StringRef mesh,
ArrayRef<MeshAxis> meshAxes,
ImplicitLocOpBuilder &builder) {
ResultRange processInGroupMultiIndex =
builder.create<ProcessMultiIndexOp>(mesh, meshAxes).getResults();
Operation::result_range processGroupShape =
builder.create<MeshShapeOp>(mesh, meshAxes).getResult();
OpFoldResult processInGroupLinearIndex = affine::linearizeIndex(
llvm::to_vector_of<OpFoldResult>(processInGroupMultiIndex),
llvm::to_vector_of<OpFoldResult>(processGroupShape), builder);
return cast<TypedValue<IndexType>>(processInGroupLinearIndex.get<Value>());
}

} // namespace mlir::mesh
248 changes: 188 additions & 60 deletions mlir/lib/Transforms/Utils/DialectConversion.cpp

Large diffs are not rendered by default.

165 changes: 165 additions & 0 deletions mlir/test/Dialect/Linalg/mesh-spmdization.mlir
@@ -0,0 +1,165 @@
// RUN: mlir-opt \
// RUN: --mesh-spmdization \
// RUN: --test-constant-fold \
// RUN: --split-input-file \
// RUN: %s | FileCheck %s

// CHECK: #[[$MAP_IDENTITY_1D:.*]] = affine_map<(d0) -> (d0)>
#map_identity_1d = affine_map<(d0) -> (d0)>

mesh.mesh @mesh_1d(shape = 2)

// CHECK-LABEL: func @elementwise_static_1d_mesh_static_1d_tensor
func.func @elementwise_static_1d_mesh_static_1d_tensor(
// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<1xi8>,
%in1: tensor<2xi8>,
// CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<1xi8>,
%in2: tensor<2xi8>,
// CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<1xi8>
%dps_out: tensor<2xi8>
// CHECK-SAME: -> tensor<1xi8> {
) -> tensor<2xi8> {
%in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[0]]> : tensor<2xi8>
%in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
%in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<2xi8>
%in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
%dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[0]]> : tensor<2xi8>
%dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
// CHECK: %[[RES:.*]] = linalg.generic {
// CHECK-SAME: indexing_maps = [#[[$MAP_IDENTITY_1D]], #[[$MAP_IDENTITY_1D]], #[[$MAP_IDENTITY_1D]]],
// CHECK-SAME: iterator_types = ["parallel"]}
// CHECK-SAME: ins(%[[IN1]], %[[IN2]] : tensor<1xi8>, tensor<1xi8>)
// CHECK-SAME: outs(%[[DPS_OUT]] : tensor<1xi8>) {
%res = linalg.generic {
indexing_maps = [#map_identity_1d, #map_identity_1d, #map_identity_1d],
iterator_types = ["parallel"]
} ins(%in1_shared2, %in2_shared2 : tensor<2xi8>, tensor<2xi8>)
outs(%dps_out_shared2 : tensor<2xi8>) {
^bb0(%in1_scalar: i8, %in2_scalar: i8, %out: i8):
%res_scalar = arith.muli %in1_scalar, %in2_scalar : i8
linalg.yield %res_scalar : i8
} -> tensor<2xi8>
%res_shared1 = mesh.shard %res to <@mesh_1d, [[0]]> : tensor<2xi8>
%res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
// CHECK: return %[[RES]] : tensor<1xi8>
return %res_shared2 : tensor<2xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 4)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_parallel_iterator_sharding
func.func @matmul_1d_mesh_static_tensors_parallel_iterator_sharding(
// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<1x3xi8>,
%in1: tensor<4x3xi8>,
// CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<3x8xi8>,
%in2: tensor<3x8xi8>,
// CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<1x8xi8>
%dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<1x8xi8> {
) -> tensor<4x8xi8> {
%in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[0]]> : tensor<4x3xi8>
%in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x3xi8>
%in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[]]> : tensor<3x8xi8>
%in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<3x8xi8>
%dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[0]]> : tensor<4x8xi8>
%dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x8xi8>
// CHECK: %[[RES:.*]] = linalg.matmul
// CHECK-SAME: ins(%[[IN1]], %[[IN2]] : tensor<1x3xi8>, tensor<3x8xi8>)
// CHECK-SAME: outs(%[[DPS_OUT]] : tensor<1x8xi8>)
// CHECK-SAME: -> tensor<1x8xi8>
%res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x3xi8>, tensor<3x8xi8>)
outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
%res_shared1 = mesh.shard %res to <@mesh_1d, [[0]]> : tensor<4x8xi8>
%res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x8xi8>
// CHECK: return %[[RES]] : tensor<1x8xi8>
return %res_shared2 : tensor<4x8xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 3)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding
func.func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding(
// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<4x2xi8>,
%in1: tensor<4x6xi8>,
// CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<2x8xi8>,
%in2: tensor<6x8xi8>,
// CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<4x8xi8>
%dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<4x8xi8> {
) -> tensor<4x8xi8> {
%in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[], [0]]> : tensor<4x6xi8>
%in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[], [0]]> annotate_for_users: tensor<4x6xi8>
%in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<6x8xi8>
%in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<6x8xi8>
%dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[]]> : tensor<4x8xi8>
%dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
// CHECK-DAG: %[[PROCESS_IDX:.*]] = mesh.process_multi_index on @mesh_1d axes = [0] : index
// CHECK-DAG: %[[MESH_SIZE:.*]] = mesh.mesh_shape @mesh_1d axes = [0] : index
// CHECK: %[[DPS_INIT_OPERAND_CONDITION:.*]] = arith.cmpi eq, %[[PROCESS_IDX]], %[[C0]] : index
// CHECK: %[[DPS_INIT_OPERAND:.*]] = scf.if %[[DPS_INIT_OPERAND_CONDITION]] -> (tensor<4x8xi8>) {
// CHECK: scf.yield %[[DPS_OUT]] : tensor<4x8xi8>
// CHECK: } else {
// CHECK-DAG: %[[EMPTY_TENSOR:.*]] = tensor.empty() : tensor<4x8xi8>
// CHECK: %[[NEUTRAL_ELEMENT_FILLED_TENSOR:.*]] = linalg.fill ins(%[[C0_I8]] : i8)
// CHECK-SAME: outs(%[[EMPTY_TENSOR]] : tensor<4x8xi8>) -> tensor<4x8xi8>
// CHECK: scf.yield %[[NEUTRAL_ELEMENT_FILLED_TENSOR]] : tensor<4x8xi8>
// CHECK: }
// CHECK: %[[SHARDED_MATMUL:.*]] = linalg.matmul ins(%[[IN1]], %[[IN2]] : tensor<4x2xi8>, tensor<2x8xi8>)
// CHECK-SAME: outs(%[[DPS_INIT_OPERAND]] : tensor<4x8xi8>) -> tensor<4x8xi8>
// CHECK: %[[ALL_REDUCED:.*]] = mesh.all_reduce %[[SHARDED_MATMUL]] on @mesh_1d mesh_axes = [0] : tensor<4x8xi8> -> tensor<4x8xi8>
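// What the checked lowering amounts to (a sketch inferred from the CHECK
// lines above): each process multiplies its K-slice (tensor<4x2xi8> by
// tensor<2x8xi8>) into a local accumulator; only process 0 seeds that
// accumulator with the original DPS operand, the others start from the
// zero-filled neutral tensor so the shared init is counted exactly once, and
// mesh.all_reduce then sums the partial products across the mesh.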
%res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x6xi8>, tensor<6x8xi8>)
outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
%res_shared1 = mesh.shard %res to <@mesh_1d, [[]]> : tensor<4x8xi8>
%res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
// CHECK: return %[[ALL_REDUCED]] : tensor<4x8xi8>
return %res_shared2 : tensor<4x8xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 3)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding_with_partial_result
func.func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding_with_partial_result(
// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<4x2xi8>,
%in1: tensor<4x6xi8>,
// CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<2x8xi8>,
%in2: tensor<6x8xi8>,
// CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<4x8xi8>
%dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<4x8xi8> {
) -> tensor<4x8xi8> {
%in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[], [0]]> : tensor<4x6xi8>
%in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[], [0]]> annotate_for_users: tensor<4x6xi8>
%in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<6x8xi8>
%in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<6x8xi8>
%dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[]]> : tensor<4x8xi8>
%dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
// CHECK-DAG: %[[PROCESS_IDX:.*]] = mesh.process_multi_index on @mesh_1d axes = [0] : index
// CHECK-DAG: %[[MESH_SIZE:.*]] = mesh.mesh_shape @mesh_1d axes = [0] : index
// CHECK: %[[DPS_INIT_OPERAND_CONDITION:.*]] = arith.cmpi eq, %[[PROCESS_IDX]], %[[C0]] : index
// CHECK: %[[DPS_INIT_OPERAND:.*]] = scf.if %[[DPS_INIT_OPERAND_CONDITION]] -> (tensor<4x8xi8>) {
// CHECK: scf.yield %[[DPS_OUT]] : tensor<4x8xi8>
// CHECK: } else {
// CHECK-DAG: %[[EMPTY_TENSOR:.*]] = tensor.empty() : tensor<4x8xi8>
// CHECK: %[[NEUTRAL_ELEMENT_FILLED_TENSOR:.*]] = linalg.fill ins(%[[C0_I8]] : i8)
// CHECK-SAME: outs(%[[EMPTY_TENSOR]] : tensor<4x8xi8>) -> tensor<4x8xi8>
// CHECK: scf.yield %[[NEUTRAL_ELEMENT_FILLED_TENSOR]] : tensor<4x8xi8>
// CHECK: }
// CHECK: %[[SHARDED_MATMUL:.*]] = linalg.matmul ins(%[[IN1]], %[[IN2]] : tensor<4x2xi8>, tensor<2x8xi8>)
// CHECK-SAME: outs(%[[DPS_INIT_OPERAND]] : tensor<4x8xi8>) -> tensor<4x8xi8>
%res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x6xi8>, tensor<6x8xi8>)
outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
%res_shared1 = mesh.shard %res to <@mesh_1d, [[]], partial = sum[0]> : tensor<4x8xi8>
%res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[]], partial = sum[0]> annotate_for_users: tensor<4x8xi8>
// CHECK: return %[[SHARDED_MATMUL]] : tensor<4x8xi8>
return %res_shared2 : tensor<4x8xi8>
}
71 changes: 70 additions & 1 deletion mlir/test/Transforms/test-legalizer.mlir
@@ -1,5 +1,10 @@
// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns -verify-diagnostics %s | FileCheck %s

// CHECK: notifyOperationInserted: test.legal_op_a, was unlinked
// CHECK-NEXT: notifyOperationReplaced: test.illegal_op_a
// CHECK-NEXT: notifyOperationModified: func.return
// CHECK-NEXT: notifyOperationErased: test.illegal_op_a

// CHECK-LABEL: verifyDirectPattern
func.func @verifyDirectPattern() -> i32 {
// CHECK-NEXT: "test.legal_op_a"() <{status = "Success"}
@@ -8,6 +13,16 @@ func.func @verifyDirectPattern() -> i32 {
return %result : i32
}

// -----

// CHECK: notifyOperationInserted: test.illegal_op_e, was unlinked
// CHECK-NEXT: notifyOperationReplaced: test.illegal_op_c
// CHECK-NEXT: notifyOperationModified: func.return
// CHECK-NEXT: notifyOperationErased: test.illegal_op_c
// CHECK-NEXT: notifyOperationInserted: test.legal_op_a, was unlinked
// CHECK-NEXT: notifyOperationReplaced: test.illegal_op_e
// CHECK-NEXT: notifyOperationErased: test.illegal_op_e

// CHECK-LABEL: verifyLargerBenefit
func.func @verifyLargerBenefit() -> i32 {
// CHECK-NEXT: "test.legal_op_a"() <{status = "Success"}
@@ -16,29 +31,61 @@ func.func @verifyLargerBenefit() -> i32 {
return %result : i32
}

// -----

// CHECK: notifyOperationModified: func.func
// Note: No block insertion because this function is external and no block
// signature conversion is performed.

// CHECK-LABEL: func private @remap_input_1_to_0()
func.func private @remap_input_1_to_0(i16)

// -----

// CHECK-LABEL: func @remap_input_1_to_1(%arg0: f64)
func.func @remap_input_1_to_1(%arg0: i64) {
// CHECK-NEXT: "test.valid"{{.*}} : (f64)
"test.invalid"(%arg0) : (i64) -> ()
}

// CHECK-LABEL: func @remap_call_1_to_1(%arg0: f64)
// CHECK: func @remap_call_1_to_1(%arg0: f64)
func.func @remap_call_1_to_1(%arg0: i64) {
// CHECK-NEXT: call @remap_input_1_to_1(%arg0) : (f64) -> ()
call @remap_input_1_to_1(%arg0) : (i64) -> ()
// expected-remark@+1 {{op 'func.return' is not legalizable}}
return
}

// -----

// Block signature conversion: new block is inserted.
// CHECK: notifyBlockInserted into func.func: was unlinked

// Contents of the old block are moved to the new block.
// CHECK-NEXT: notifyOperationInserted: test.return, was linked, exact position unknown

// The new block arguments are used in "test.return".
// CHECK-NEXT: notifyOperationModified: test.return

// The old block is erased.
// CHECK-NEXT: notifyBlockErased

// The function op gets a new type attribute.
// CHECK-NEXT: notifyOperationModified: func.func

// "test.return" is replaced.
// CHECK-NEXT: notifyOperationInserted: test.return, was unlinked
// CHECK-NEXT: notifyOperationReplaced: test.return
// CHECK-NEXT: notifyOperationErased: test.return

// CHECK-LABEL: func @remap_input_1_to_N({{.*}}f16, {{.*}}f16)
func.func @remap_input_1_to_N(%arg0: f32) -> f32 {
// CHECK-NEXT: "test.return"{{.*}} : (f16, f16) -> ()
"test.return"(%arg0) : (f32) -> ()
}

// -----

// CHECK-LABEL: func @remap_input_1_to_N_remaining_use(%arg0: f16, %arg1: f16)
func.func @remap_input_1_to_N_remaining_use(%arg0: f32) {
// CHECK-NEXT: [[CAST:%.*]] = "test.cast"(%arg0, %arg1) : (f16, f16) -> f32
@@ -54,6 +101,8 @@ func.func @remap_materialize_1_to_1(%arg0: i42) {
"test.return"(%arg0) : (i42) -> ()
}

// -----

// CHECK-LABEL: func @remap_input_to_self
func.func @remap_input_to_self(%arg0: index) {
// CHECK-NOT: test.cast
@@ -68,6 +117,8 @@ func.func @remap_multi(%arg0: i64, %unused: i16, %arg1: i64) -> (i64, i64) {
"test.invalid"(%arg0, %arg1) : (i64, i64) -> ()
}

// -----

// CHECK-LABEL: func @no_remap_nested
func.func @no_remap_nested() {
// CHECK-NEXT: "foo.region"
@@ -82,6 +133,8 @@ func.func @no_remap_nested() {
return
}

// -----

// CHECK-LABEL: func @remap_moved_region_args
func.func @remap_moved_region_args() {
// CHECK-NEXT: return
@@ -96,6 +149,8 @@ func.func @remap_moved_region_args() {
return
}

// -----

// CHECK-LABEL: func @remap_cloned_region_args
func.func @remap_cloned_region_args() {
// CHECK-NEXT: return
@@ -122,6 +177,8 @@ func.func @remap_drop_region() {
return
}

// -----

// CHECK-LABEL: func @dropped_input_in_use
func.func @dropped_input_in_use(%arg: i16, %arg2: i64) {
// CHECK-NEXT: "test.cast"{{.*}} : () -> i16
@@ -130,6 +187,8 @@ func.func @dropped_input_in_use(%arg: i16, %arg2: i64) {
"work"(%arg) : (i16) -> ()
}

// -----

// CHECK-LABEL: func @up_to_date_replacement
func.func @up_to_date_replacement(%arg: i8) -> i8 {
// CHECK-NEXT: return
@@ -139,6 +198,8 @@ func.func @up_to_date_replacement(%arg: i8) -> i8 {
return %repl_2 : i8
}

// -----

// CHECK-LABEL: func @remove_foldable_op
// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: i32)
func.func @remove_foldable_op(%arg0 : i32) -> (i32) {
@@ -150,6 +211,8 @@ func.func @remove_foldable_op(%arg0 : i32) -> (i32) {
return %0 : i32
}

// -----

// CHECK-LABEL: @create_block
func.func @create_block() {
// Check that we created a block with arguments.
@@ -161,6 +224,12 @@ func.func @create_block() {
return
}

// -----

// CHECK: notifyOperationModified: test.recursive_rewrite
// CHECK-NEXT: notifyOperationModified: test.recursive_rewrite
// CHECK-NEXT: notifyOperationModified: test.recursive_rewrite

// CHECK-LABEL: @bounded_recursion
func.func @bounded_recursion() {
// CHECK: test.recursive_rewrite 0
28 changes: 24 additions & 4 deletions mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -327,8 +327,12 @@ struct TestPatternDriver
struct DumpNotifications : public RewriterBase::Listener {
void notifyBlockInserted(Block *block, Region *previous,
Region::iterator previousIt) override {
llvm::outs() << "notifyBlockInserted into "
<< block->getParentOp()->getName() << ": ";
llvm::outs() << "notifyBlockInserted";
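    // The block may not be attached to an operation yet (e.g., when it is
    // inserted while detached during conversion), so guard against a null
    // parent op before printing its name.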
if (block->getParentOp()) {
llvm::outs() << " into " << block->getParentOp()->getName() << ": ";
} else {
llvm::outs() << " into unknown op: ";
}
if (previous == nullptr) {
llvm::outs() << "was unlinked\n";
} else {
@@ -341,17 +345,28 @@ struct DumpNotifications : public RewriterBase::Listener {
if (!previous.isSet()) {
llvm::outs() << ", was unlinked\n";
} else {
if (previous.getPoint() == previous.getBlock()->end()) {
if (!previous.getPoint().getNodePtr()) {
llvm::outs() << ", was linked, exact position unknown\n";
} else if (previous.getPoint() == previous.getBlock()->end()) {
llvm::outs() << ", was last in block\n";
} else {
llvm::outs() << ", previous = " << previous.getPoint()->getName()
<< "\n";
}
}
}
void notifyBlockErased(Block *block) override {
llvm::outs() << "notifyBlockErased\n";
}
void notifyOperationErased(Operation *op) override {
llvm::outs() << "notifyOperationErased: " << op->getName() << "\n";
}
void notifyOperationModified(Operation *op) override {
llvm::outs() << "notifyOperationModified: " << op->getName() << "\n";
}
void notifyOperationReplaced(Operation *op, ValueRange values) override {
llvm::outs() << "notifyOperationReplaced: " << op->getName() << "\n";
}
};

struct TestStrictPatternDriver
@@ -1153,6 +1168,8 @@ struct TestLegalizePatternDriver
if (mode == ConversionMode::Partial) {
DenseSet<Operation *> unlegalizedOps;
ConversionConfig config;
DumpNotifications dumpNotifications;
config.listener = &dumpNotifications;
config.unlegalizedOps = &unlegalizedOps;
if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns), config))) {
Expand All @@ -1171,8 +1188,11 @@ struct TestLegalizePatternDriver
return (bool)op->getAttrOfType<UnitAttr>("test.dynamically_legal");
});

ConversionConfig config;
DumpNotifications dumpNotifications;
config.listener = &dumpNotifications;
if (failed(applyFullConversion(getOperation(), target,
std::move(patterns)))) {
std::move(patterns), config))) {
getOperation()->emitRemark() << "applyFullConversion failed";
}
return;
311 changes: 311 additions & 0 deletions openmp/runtime/src/kmp_collapse.cpp

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions openmp/runtime/src/kmp_collapse.h
@@ -45,6 +45,13 @@ enum loop_type_t : kmp_int32 {
loop_type_int64 = 7
};

// Defining loop types to handle special cases
enum nested_loop_type_t : kmp_int32 {
nested_loop_type_unkown = 0,
nested_loop_type_lower_triangular_matrix = 1,
nested_loop_type_upper_triangular_matrix = 2
};
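// For orientation, a hedged sketch (bounds hypothetical) of the loop nests
// these values classify:
//   lower triangular:  for (i = 0; i < n; ++i)
//                        for (j = 0; j < i; ++j)   // or j <= i
//   upper triangular:  for (i = 0; i < n; ++i)
//                        for (j = i; j < n; ++j)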

/*!
@ingroup WORK_SHARING
* Describes the structure for rectangular nested loops.
@@ -124,14 +131,14 @@ struct bounds_info_t {
// It's represented in kmp_uint64, but each dimension is calculated in
// that loop IV type. Also dimensions have to be converted to those types
// when used in generated code.
typedef kmp_uint64* kmp_point_t;
typedef kmp_uint64 *kmp_point_t;

// Array: number of loop iterations on each nesting level needed to reach a
// given point, in expanded space or in original space.
// OMPTODO: move from using iterations to using offsets (iterations multiplied
// by steps). For those we need to be careful with the types, as step can be
// negative, but it'll remove multiplications and divisions in several places.
typedef kmp_loop_nest_iv_t* kmp_iterations_t;
typedef kmp_loop_nest_iv_t *kmp_iterations_t;

// Internal struct with additional info:
template <typename T> struct bounds_info_internalXX_template {
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, check that only valid iterations are executed and
   that each is executed exactly once. Stores the number of times an iteration
   is executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
unsigned num_threads = omp_get_max_threads();
unsigned lower_per_chunk = total_iterations / num_threads;
unsigned upper_per_chunk =
lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
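  /* Worked example (hypothetical run): upper_bound = 64 gives
     total_iterations = 64 * 63 / 2 = 2016, so with 8 threads each thread
     should get lower_per_chunk = upper_per_chunk = 252 iterations. */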
int i, j;

omp_set_num_threads(num_threads);

ZeroOut(execution_count, upper_bound, upper_bound);
ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
fprintf(stderr,
"INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
"chunks "
"loop type lower triangle <,< - ",
num_threads, upper_bound, total_iterations, lower_per_chunk,
upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
{ /* begin of parallel */
/* Lower triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
for (i = 0; i < upper_bound; i++) {
for (j = 0; j < i; j++) {
(*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
(*Index(execution_count, i, j, upper_bound))++;
}
    } /* end of for */
} /* end of parallel */

/* check the execution_count array */
for (i = 0; i < upper_bound; i++) {
for (j = 0; j < i; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j < i are valid and should have been executed exactly
         once */
if (value != 1) {
fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
for (j = i; j < upper_bound; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j >= i are invalid and should not have been executed
       */
if (value > 0) {
fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
}

#ifndef NO_EFFICIENCY_CHECK
/* Ensure the number of iterations executed by each thread is within bounds */
for (i = 0; i < num_threads; i++) {
unsigned value = *Index(iterations_per_thread, i, 0, 1);
if (value < lower_per_chunk || value > upper_per_chunk) {
fprintf(stderr,
"ERROR: Inefficient Collapse thread %d of %d assigned %i "
"iterations; must be between %d and %d\n",
i, num_threads, value, lower_per_chunk, upper_per_chunk);
return 0;
}
}
#endif
#ifdef VERBOSE
fprintf(stderr, "PASSED\r\n");
#endif
return 1;
}

int main() {

execution_count = Alloc(MAX_BOUND, MAX_BOUND);
iterations_per_thread = Alloc(omp_get_max_threads(), 1);

for (unsigned j = 0; j < MAX_BOUND; j++) {
if (!test(j))
return 1;
}
Free(execution_count);
Free(iterations_per_thread);
return 0;
}
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, check that only valid iterations are executed and
   that each is executed exactly once. Stores the number of times an iteration
   is executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
unsigned num_threads = omp_get_max_threads();
unsigned lower_per_chunk = total_iterations / num_threads;
unsigned upper_per_chunk =
lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
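  /* Unlike the strict (j < i) variant, the diagonal is included here: a
     hypothetical upper_bound = 64 gives total_iterations = 64 * 65 / 2 = 2080,
     i.e. 260 iterations per thread when 8 threads are used. */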
int i, j;

omp_set_num_threads(num_threads);

ZeroOut(execution_count, upper_bound, upper_bound);
ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
fprintf(stderr,
"INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
"chunks "
"loop type lower triangle <,<= - ",
num_threads, upper_bound, total_iterations, lower_per_chunk,
upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
{ /* begin of parallel */
/* Lower triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
for (i = 0; i < upper_bound; i++) {
for (j = 0; j <= i; j++) {
(*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
(*Index(execution_count, i, j, upper_bound))++;
}
    } /* end of for */
} /* end of parallel */

/* check the execution_count array */
for (i = 0; i < upper_bound; i++) {
for (j = 0; j <= i; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j <= i are valid and should have been executed exactly
         once */
if (value != 1) {
fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
for (j = i + 1; j < upper_bound; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j > i are invalid and should not have been executed
       */
if (value > 0) {
fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
}

#ifndef NO_EFFICIENCY_CHECK
/* Ensure the number of iterations executed by each thread is within bounds */
for (i = 0; i < num_threads; i++) {
unsigned value = *Index(iterations_per_thread, i, 0, 1);
if (value < lower_per_chunk || value > upper_per_chunk) {
fprintf(stderr,
"ERROR: Inefficient Collapse thread %d of %d assigned %i "
"iterations; must be between %d and %d\n",
i, num_threads, value, lower_per_chunk, upper_per_chunk);
return 0;
}
}
#endif
#ifdef VERBOSE
fprintf(stderr, "PASSED\r\n");
#endif
return 1;
}

int main() {

execution_count = Alloc(MAX_BOUND, MAX_BOUND);
iterations_per_thread = Alloc(omp_get_max_threads(), 1);

for (unsigned j = 0; j < MAX_BOUND; j++) {
if (!test(j))
return 1;
}
Free(execution_count);
Free(iterations_per_thread);
return 0;
}
124 changes: 124 additions & 0 deletions openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, check that only valid iterations are executed and
   that each is executed exactly once. Stores the number of times an iteration
   is executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
unsigned num_threads = omp_get_max_threads();
unsigned lower_per_chunk = total_iterations / num_threads;
unsigned upper_per_chunk =
lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
int i, j;

omp_set_num_threads(num_threads);

ZeroOut(execution_count, upper_bound, upper_bound);
ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
fprintf(stderr,
"INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
"chunks "
"loop type upper triangle <,< - ",
num_threads, upper_bound, total_iterations, lower_per_chunk,
upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
{ /* begin of parallel */
/* Upper triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
for (i = 0; i < upper_bound; i++) {
for (j = i; j < upper_bound; j++) {
(*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
(*Index(execution_count, i, j, upper_bound))++;
}
    } /* end of for */
} /* end of parallel */

/* check the execution_count array */
for (i = 0; i < upper_bound; i++) {
for (j = i; j < upper_bound; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j >= i are valid and should have been executed exactly
         once */
if (value != 1) {
fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
for (j = 0; j < i; j++) {
unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j < i are invalid and should not have been executed
       */
if (value > 0) {
fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
i, j, value);
return 0;
}
}
}

#ifndef NO_EFFICIENCY_CHECK
/* Ensure the number of iterations executed by each thread is within bounds */
for (i = 0; i < num_threads; i++) {
unsigned value = *Index(iterations_per_thread, i, 0, 1);
if (value < lower_per_chunk || value > upper_per_chunk) {
fprintf(stderr,
"ERROR: Inefficient Collapse thread %d of %d assigned %i "
"iterations; must be between %d and %d\n",
i, num_threads, value, lower_per_chunk, upper_per_chunk);
return 0;
}
}
#endif
#ifdef VERBOSE
fprintf(stderr, "PASSED\r\n");
#endif
return 1;
}

int main() {

execution_count = Alloc(MAX_BOUND, MAX_BOUND);
iterations_per_thread = Alloc(omp_get_max_threads(), 1);

for (unsigned j = 0; j < MAX_BOUND; j++) {
if (!test(j))
return 1;
}
Free(execution_count);
Free(iterations_per_thread);
return 0;
}
4 changes: 4 additions & 0 deletions utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10841,6 +10841,7 @@ cc_library(
        ":MemRefDialect",
        ":MeshShardingInterface",
        ":Parser",
        ":SCFDialect",
        ":SideEffectInterfaces",
        ":SparseTensorDialect",
        ":Support",
@@ -10994,10 +10995,13 @@ cc_library(
        ":MathDialect",
        ":MemRefDialect",
        ":MemRefTransforms",
        ":MeshDialect",
        ":MeshShardingInterface",
        ":MeshTransforms",
        ":Pass",
        ":SCFDialect",
        ":SCFTransforms",
        ":SCFUtils",
        ":SparseTensorDialect",
        ":SubsetOpInterface",
":Support",