215 changes: 215 additions & 0 deletions clang/test/OpenMP/irbuilder_unroll_unroll_partial_factor.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=51 -x c -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// expected-no-diagnostics

#ifndef HEADER
#define HEADER

// CHECK-LABEL: define {{.*}}@unroll_partial_factor_for(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[P_LASTITER:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[P_LOWERBOUND:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[P_UPPERBOUND:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[P_STRIDE:.+]] = alloca i32, align 4
// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
// CHECK-NEXT: store i32 0, i32* %[[I]], align 4
// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
// CHECK-NEXT: %[[TMP3:.+]] = udiv i32 %[[DOTCOUNT]], 2
// CHECK-NEXT: %[[TMP4:.+]] = urem i32 %[[DOTCOUNT]], 2
// CHECK-NEXT: %[[TMP5:.+]] = icmp ne i32 %[[TMP4]], 0
// CHECK-NEXT: %[[TMP6:.+]] = zext i1 %[[TMP5]] to i32
// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP3]], %[[TMP6]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]:
// CHECK-NEXT: store i32 0, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP7:.+]] = sub i32 %[[OMP_FLOOR0_TRIPCOUNT]], 1
// CHECK-NEXT: store i32 %[[TMP7]], i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: store i32 1, i32* %[[P_STRIDE]], align 4
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* %[[P_LASTITER]], i32* %[[P_LOWERBOUND]], i32* %[[P_UPPERBOUND]], i32* %[[P_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[P_LOWERBOUND]], align 4
// CHECK-NEXT: %[[TMP9:.+]] = load i32, i32* %[[P_UPPERBOUND]], align 4
// CHECK-NEXT: %[[TMP10:.+]] = sub i32 %[[TMP9]], %[[TMP8]]
// CHECK-NEXT: %[[TMP11:.+]] = add i32 %[[TMP10]], 1
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]:
// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ]
// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_COND]]:
// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV]], %[[TMP11]]
// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_BODY]]:
// CHECK-NEXT: %[[TMP12:.+]] = add i32 %[[OMP_FLOOR0_IV]], %[[TMP8]]
// CHECK-NEXT: %[[TMP13:.+]] = icmp eq i32 %[[TMP12]], %[[OMP_FLOOR0_TRIPCOUNT]]
// CHECK-NEXT: %[[TMP14:.+]] = select i1 %[[TMP13]], i32 %[[TMP4]], i32 2
// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]:
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_HEADER]]:
// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ]
// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_COND]]:
// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV]], %[[TMP14]]
// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_BODY]]:
// CHECK-NEXT: %[[TMP15:.+]] = mul nuw i32 2, %[[TMP12]]
// CHECK-NEXT: %[[TMP16:.+]] = add nuw i32 %[[TMP15]], %[[OMP_TILE0_IV]]
// CHECK-NEXT: br label %[[OMP_LOOP_BODY:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_BODY]]:
// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[TMP16]], %struct.anon.0* %[[AGG_CAPTURED1]])
// CHECK-NEXT: %[[TMP17:.+]] = load float*, float** %[[B_ADDR]], align 8
// CHECK-NEXT: %[[TMP18:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM:.+]] = sext i32 %[[TMP18]] to i64
// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP17]], i64 %[[IDXPROM]]
// CHECK-NEXT: %[[TMP19:.+]] = load float, float* %[[ARRAYIDX]], align 4
// CHECK-NEXT: %[[TMP20:.+]] = load float*, float** %[[C_ADDR]], align 8
// CHECK-NEXT: %[[TMP21:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM2:.+]] = sext i32 %[[TMP21]] to i64
// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP20]], i64 %[[IDXPROM2]]
// CHECK-NEXT: %[[TMP22:.+]] = load float, float* %[[ARRAYIDX3]], align 4
// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP19]], %[[TMP22]]
// CHECK-NEXT: %[[TMP23:.+]] = load float*, float** %[[D_ADDR]], align 8
// CHECK-NEXT: %[[TMP24:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM4:.+]] = sext i32 %[[TMP24]] to i64
// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP23]], i64 %[[IDXPROM4]]
// CHECK-NEXT: %[[TMP25:.+]] = load float, float* %[[ARRAYIDX5]], align 4
// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP25]]
// CHECK-NEXT: %[[TMP26:.+]] = load float*, float** %[[A_ADDR]], align 8
// CHECK-NEXT: %[[TMP27:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM7:.+]] = sext i32 %[[TMP27]] to i64
// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP26]], i64 %[[IDXPROM7]]
// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
// CHECK-NEXT: br label %[[OMP_TILE0_INC]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_INC]]:
// CHECK-NEXT: %[[OMP_TILE0_NEXT]] = add nuw i32 %[[OMP_TILE0_IV]], 1
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER]], !llvm.loop ![[LOOP3:[0-9]+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_EXIT]]:
// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_INC]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_INC]]:
// CHECK-NEXT: %[[OMP_FLOOR0_NEXT]] = add nuw i32 %[[OMP_FLOOR0_IV]], 1
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]:
// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[OMP_GLOBAL_THREAD_NUM]])
// CHECK-NEXT: %[[OMP_GLOBAL_THREAD_NUM9:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* @2, i32 %[[OMP_GLOBAL_THREAD_NUM9]])
// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
// CHECK-NEXT: ret void
// CHECK-NEXT: }

// Test input: a worksharing-loop directive applied on top of a loop that is
// partially unrolled with an explicit factor of 2. Because `omp for` consumes
// the loop generated by `omp unroll`, the unrolled loop has to exist as a real
// loop nest in the emitted IR (floor/tile loops with a tile size of 2) instead
// of being described by loop metadata alone.
void unroll_partial_factor_for(float *a, float *b, float *c, float *d) {
#pragma omp for
#pragma omp unroll partial(2)
for (int i = 0; i < 2; i++) {
// Element-wise product of the three inputs; the body only needs to be
// non-trivial enough to show up in the generated loop body.
a[i] = b[i] * c[i] * d[i];
}
}

#endif // HEADER

// CHECK-LABEL: define {{.*}}@__captured_stmt(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
// CHECK-NEXT: store i32 2, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: store i32 1, i32* %[[DOTSTEP]], align 4
// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP4]], %[[TMP5]]
// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_TRUE]]:
// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
// CHECK-NEXT: %[[SUB:.+]] = sub nsw i32 %[[TMP6]], %[[TMP7]]
// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[SUB]], %[[TMP8]]
// CHECK-NEXT: br label %[[COND_END:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_FALSE]]:
// CHECK-NEXT: br label %[[COND_END]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_END]]:
// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
// CHECK-NEXT: %[[TMP9:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP9]], align 4
// CHECK-NEXT: ret void
// CHECK-NEXT: }


// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
// CHECK-NEXT: %[[MUL:.+]] = mul i32 1, %[[TMP3]]
// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
// CHECK-NEXT: ret void
// CHECK-NEXT: }


// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 51}
// CHECK: ![[META2:[0-9]+]] =
// CHECK: ![[LOOP3]] = distinct !{![[LOOP3]], ![[LOOPPROP4:[0-9]+]], ![[LOOPPROP5:[0-9]+]]}
// CHECK: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOPPROP5]] = !{!"llvm.loop.unroll.count", i32 2}
197 changes: 197 additions & 0 deletions clang/test/OpenMP/irbuilder_unroll_unroll_partial_heuristic.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=51 -x c -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// expected-no-diagnostics

#ifndef HEADER
#define HEADER

// CHECK-LABEL: define {{.*}}@unroll_unroll_partial_heuristic(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[A_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[B_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[C_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[D_ADDR:.+]] = alloca float*, align 8
// CHECK-NEXT: %[[I:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[AGG_CAPTURED:.+]] = alloca %struct.anon, align 8
// CHECK-NEXT: %[[AGG_CAPTURED1:.+]] = alloca %struct.anon.0, align 4
// CHECK-NEXT: %[[DOTCOUNT_ADDR:.+]] = alloca i32, align 4
// CHECK-NEXT: store float* %[[A:.+]], float** %[[A_ADDR]], align 8
// CHECK-NEXT: store float* %[[B:.+]], float** %[[B_ADDR]], align 8
// CHECK-NEXT: store float* %[[C:.+]], float** %[[C_ADDR]], align 8
// CHECK-NEXT: store float* %[[D:.+]], float** %[[D_ADDR]], align 8
// CHECK-NEXT: store i32 0, i32* %[[I]], align 4
// CHECK-NEXT: %[[TMP0:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[AGG_CAPTURED]], i32 0, i32 0
// CHECK-NEXT: store i32* %[[I]], i32** %[[TMP0]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[AGG_CAPTURED1]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: store i32 %[[TMP2]], i32* %[[TMP1]], align 4
// CHECK-NEXT: call void @__captured_stmt(i32* %[[DOTCOUNT_ADDR]], %struct.anon* %[[AGG_CAPTURED]])
// CHECK-NEXT: %[[DOTCOUNT:.+]] = load i32, i32* %[[DOTCOUNT_ADDR]], align 4
// CHECK-NEXT: br label %[[OMP_LOOP_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_PREHEADER]]:
// CHECK-NEXT: %[[TMP3:.+]] = udiv i32 %[[DOTCOUNT]], 8
// CHECK-NEXT: %[[TMP4:.+]] = urem i32 %[[DOTCOUNT]], 8
// CHECK-NEXT: %[[TMP5:.+]] = icmp ne i32 %[[TMP4]], 0
// CHECK-NEXT: %[[TMP6:.+]] = zext i1 %[[TMP5]] to i32
// CHECK-NEXT: %[[OMP_FLOOR0_TRIPCOUNT:.+]] = add nuw i32 %[[TMP3]], %[[TMP6]]
// CHECK-NEXT: br label %[[OMP_FLOOR0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_PREHEADER]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_HEADER]]:
// CHECK-NEXT: %[[OMP_FLOOR0_IV:.+]] = phi i32 [ 0, %[[OMP_FLOOR0_PREHEADER]] ], [ %[[OMP_FLOOR0_NEXT:.+]], %[[OMP_FLOOR0_INC:.+]] ]
// CHECK-NEXT: br label %[[OMP_FLOOR0_COND:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_COND]]:
// CHECK-NEXT: %[[OMP_FLOOR0_CMP:.+]] = icmp ult i32 %[[OMP_FLOOR0_IV]], %[[OMP_FLOOR0_TRIPCOUNT]]
// CHECK-NEXT: br i1 %[[OMP_FLOOR0_CMP]], label %[[OMP_FLOOR0_BODY:.+]], label %[[OMP_FLOOR0_EXIT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_BODY]]:
// CHECK-NEXT: %[[TMP7:.+]] = icmp eq i32 %[[OMP_FLOOR0_IV]], %[[OMP_FLOOR0_TRIPCOUNT]]
// CHECK-NEXT: %[[TMP8:.+]] = select i1 %[[TMP7]], i32 %[[TMP4]], i32 8
// CHECK-NEXT: br label %[[OMP_TILE0_PREHEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_PREHEADER]]:
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_HEADER]]:
// CHECK-NEXT: %[[OMP_TILE0_IV:.+]] = phi i32 [ 0, %[[OMP_TILE0_PREHEADER]] ], [ %[[OMP_TILE0_NEXT:.+]], %[[OMP_TILE0_INC:.+]] ]
// CHECK-NEXT: br label %[[OMP_TILE0_COND:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_COND]]:
// CHECK-NEXT: %[[OMP_TILE0_CMP:.+]] = icmp ult i32 %[[OMP_TILE0_IV]], %[[TMP8]]
// CHECK-NEXT: br i1 %[[OMP_TILE0_CMP]], label %[[OMP_TILE0_BODY:.+]], label %[[OMP_TILE0_EXIT:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_BODY]]:
// CHECK-NEXT: %[[TMP9:.+]] = mul nuw i32 8, %[[OMP_FLOOR0_IV]]
// CHECK-NEXT: %[[TMP10:.+]] = add nuw i32 %[[TMP9]], %[[OMP_TILE0_IV]]
// CHECK-NEXT: br label %[[OMP_LOOP_BODY:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_BODY]]:
// CHECK-NEXT: call void @__captured_stmt.1(i32* %[[I]], i32 %[[TMP10]], %struct.anon.0* %[[AGG_CAPTURED1]])
// CHECK-NEXT: %[[TMP11:.+]] = load float*, float** %[[B_ADDR]], align 8
// CHECK-NEXT: %[[TMP12:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM:.+]] = sext i32 %[[TMP12]] to i64
// CHECK-NEXT: %[[ARRAYIDX:.+]] = getelementptr inbounds float, float* %[[TMP11]], i64 %[[IDXPROM]]
// CHECK-NEXT: %[[TMP13:.+]] = load float, float* %[[ARRAYIDX]], align 4
// CHECK-NEXT: %[[TMP14:.+]] = load float*, float** %[[C_ADDR]], align 8
// CHECK-NEXT: %[[TMP15:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM2:.+]] = sext i32 %[[TMP15]] to i64
// CHECK-NEXT: %[[ARRAYIDX3:.+]] = getelementptr inbounds float, float* %[[TMP14]], i64 %[[IDXPROM2]]
// CHECK-NEXT: %[[TMP16:.+]] = load float, float* %[[ARRAYIDX3]], align 4
// CHECK-NEXT: %[[MUL:.+]] = fmul float %[[TMP13]], %[[TMP16]]
// CHECK-NEXT: %[[TMP17:.+]] = load float*, float** %[[D_ADDR]], align 8
// CHECK-NEXT: %[[TMP18:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM4:.+]] = sext i32 %[[TMP18]] to i64
// CHECK-NEXT: %[[ARRAYIDX5:.+]] = getelementptr inbounds float, float* %[[TMP17]], i64 %[[IDXPROM4]]
// CHECK-NEXT: %[[TMP19:.+]] = load float, float* %[[ARRAYIDX5]], align 4
// CHECK-NEXT: %[[MUL6:.+]] = fmul float %[[MUL]], %[[TMP19]]
// CHECK-NEXT: %[[TMP20:.+]] = load float*, float** %[[A_ADDR]], align 8
// CHECK-NEXT: %[[TMP21:.+]] = load i32, i32* %[[I]], align 4
// CHECK-NEXT: %[[IDXPROM7:.+]] = sext i32 %[[TMP21]] to i64
// CHECK-NEXT: %[[ARRAYIDX8:.+]] = getelementptr inbounds float, float* %[[TMP20]], i64 %[[IDXPROM7]]
// CHECK-NEXT: store float %[[MUL6]], float* %[[ARRAYIDX8]], align 4
// CHECK-NEXT: br label %[[OMP_TILE0_INC]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_INC]]:
// CHECK-NEXT: %[[OMP_TILE0_NEXT]] = add nuw i32 %[[OMP_TILE0_IV]], 1
// CHECK-NEXT: br label %[[OMP_TILE0_HEADER]], !llvm.loop ![[LOOP3:[0-9]+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_EXIT]]:
// CHECK-NEXT: br label %[[OMP_TILE0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_TILE0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_INC]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_INC]]:
// CHECK-NEXT: %[[OMP_FLOOR0_NEXT]] = add nuw i32 %[[OMP_FLOOR0_IV]], 1
// CHECK-NEXT: br label %[[OMP_FLOOR0_HEADER]], !llvm.loop ![[LOOP6:[0-9]+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_EXIT]]:
// CHECK-NEXT: br label %[[OMP_FLOOR0_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_FLOOR0_AFTER]]:
// CHECK-NEXT: br label %[[OMP_LOOP_AFTER:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[OMP_LOOP_AFTER]]:
// CHECK-NEXT: ret void
// CHECK-NEXT: }
// Test input: two stacked `omp unroll partial` directives, neither of which
// names an unroll factor. The inner directive's generated loop is consumed by
// the outer directive, so its factor must be chosen while emitting IR (the
// assertions above pin a factor of 8 for this target); the outer directive is
// only recorded as unroll-enable metadata on the remaining floor loop.
void unroll_unroll_partial_heuristic(float *a, float *b, float *c, float *d) {
#pragma omp unroll partial
#pragma omp unroll partial
for (int i = 0; i < 2; i++) {
// Element-wise product of the three inputs.
a[i] = b[i] * c[i] * d[i];
}
}

#endif // HEADER

// CHECK-LABEL: define {{.*}}@__captured_stmt(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[DISTANCE_ADDR:.+]] = alloca i32*, align 8
// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon*, align 8
// CHECK-NEXT: %[[DOTSTART:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[DOTSTOP:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[DOTSTEP:.+]] = alloca i32, align 4
// CHECK-NEXT: store i32* %[[DISTANCE:.+]], i32** %[[DISTANCE_ADDR]], align 8
// CHECK-NEXT: store %struct.anon* %[[__CONTEXT:.+]], %struct.anon** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon*, %struct.anon** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon, %struct.anon* %[[TMP0]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[TMP1]], align 8
// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4
// CHECK-NEXT: store i32 %[[TMP3]], i32* %[[DOTSTART]], align 4
// CHECK-NEXT: store i32 2, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: store i32 1, i32* %[[DOTSTEP]], align 4
// CHECK-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTSTART]], align 4
// CHECK-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP4]], %[[TMP5]]
// CHECK-NEXT: br i1 %[[CMP]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_TRUE]]:
// CHECK-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTSTOP]], align 4
// CHECK-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTSTART]], align 4
// CHECK-NEXT: %[[SUB:.+]] = sub nsw i32 %[[TMP6]], %[[TMP7]]
// CHECK-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTSTEP]], align 4
// CHECK-NEXT: %[[DIV:.+]] = udiv i32 %[[SUB]], %[[TMP8]]
// CHECK-NEXT: br label %[[COND_END:.+]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_FALSE]]:
// CHECK-NEXT: br label %[[COND_END]]
// CHECK-EMPTY:
// CHECK-NEXT: [[COND_END]]:
// CHECK-NEXT: %[[COND:.+]] = phi i32 [ %[[DIV]], %[[COND_TRUE]] ], [ 0, %[[COND_FALSE]] ]
// CHECK-NEXT: %[[TMP9:.+]] = load i32*, i32** %[[DISTANCE_ADDR]], align 8
// CHECK-NEXT: store i32 %[[COND]], i32* %[[TMP9]], align 4
// CHECK-NEXT: ret void
// CHECK-NEXT: }


// CHECK-LABEL: define {{.*}}@__captured_stmt.1(
// CHECK-NEXT: [[ENTRY:.*]]:
// CHECK-NEXT: %[[LOOPVAR_ADDR:.+]] = alloca i32*, align 8
// CHECK-NEXT: %[[LOGICAL_ADDR:.+]] = alloca i32, align 4
// CHECK-NEXT: %[[__CONTEXT_ADDR:.+]] = alloca %struct.anon.0*, align 8
// CHECK-NEXT: store i32* %[[LOOPVAR:.+]], i32** %[[LOOPVAR_ADDR]], align 8
// CHECK-NEXT: store i32 %[[LOGICAL:.+]], i32* %[[LOGICAL_ADDR]], align 4
// CHECK-NEXT: store %struct.anon.0* %[[__CONTEXT:.+]], %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP0:.+]] = load %struct.anon.0*, %struct.anon.0** %[[__CONTEXT_ADDR]], align 8
// CHECK-NEXT: %[[TMP1:.+]] = getelementptr inbounds %struct.anon.0, %struct.anon.0* %[[TMP0]], i32 0, i32 0
// CHECK-NEXT: %[[TMP2:.+]] = load i32, i32* %[[TMP1]], align 4
// CHECK-NEXT: %[[TMP3:.+]] = load i32, i32* %[[LOGICAL_ADDR]], align 4
// CHECK-NEXT: %[[MUL:.+]] = mul i32 1, %[[TMP3]]
// CHECK-NEXT: %[[ADD:.+]] = add i32 %[[TMP2]], %[[MUL]]
// CHECK-NEXT: %[[TMP4:.+]] = load i32*, i32** %[[LOOPVAR_ADDR]], align 8
// CHECK-NEXT: store i32 %[[ADD]], i32* %[[TMP4]], align 4
// CHECK-NEXT: ret void
// CHECK-NEXT: }


// CHECK: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// CHECK: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 51}
// CHECK: ![[META2:[0-9]+]] =
// CHECK: ![[LOOP3]] = distinct !{![[LOOP3]], ![[LOOPPROP4:[0-9]+]], ![[LOOPPROP5:[0-9]+]]}
// CHECK: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"}
// CHECK: ![[LOOPPROP5]] = !{!"llvm.loop.unroll.count", i32 8}
// CHECK: ![[LOOP6]] = distinct !{![[LOOP6]], ![[LOOPPROP4]]}
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/LoopInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,10 @@ bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name);
llvm::Optional<int>
getOptionalIntLoopAttribute(const Loop *TheLoop, StringRef Name);

/// Find named metadata for a loop with an integer value.
///
/// Convenience wrapper around getOptionalIntLoopAttribute() for callers that
/// do not need to distinguish "attribute missing" from an explicit value.
///
/// \param TheLoop The loop whose metadata is queried.
/// \param Name    Name of the loop attribute to look up.
/// \param Default Value returned when getOptionalIntLoopAttribute() yields no
///                value for \p Name.
/// \returns The attribute's integer value, or \p Default if not set.
int getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default = 0);

/// Find string metadata for loop
///
/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
Expand Down
42 changes: 42 additions & 0 deletions llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,48 @@ class OpenMPIRBuilder {
tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
ArrayRef<Value *> TileSizes);

/// Fully unroll a loop.
///
/// Instead of unrolling the loop immediately (and duplicating its body
/// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
/// metadata ("llvm.loop.unroll.enable" and "llvm.loop.unroll.full") to the
/// loop's latch terminator.
///
/// \param DL   Debug location for instructions added by unrolling. Since no
///             instructions are emitted here, the current implementation does
///             not use it.
/// \param Loop The loop to unroll. The loop will be invalidated.
///             NOTE(review): the implementation only attaches metadata and
///             does not itself mark the CanonicalLoopInfo as invalid; callers
///             should still treat \p Loop as consumed.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);

/// Fully or partially unroll a loop. How the loop is unrolled is determined
/// using LLVM's LoopUnrollPass.
///
/// Only "llvm.loop.unroll.enable" loop metadata is attached; whether and by
/// how much the loop is unrolled is decided later by the pass.
///
/// \param DL   Debug location for instructions added by unrolling. Since no
///             instructions are emitted here, the current implementation does
///             not use it.
/// \param Loop The loop to unroll. The loop will be invalidated.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);

/// Partially unroll a loop.
///
/// The CanonicalLoopInfo of the unrolled loop, for use with a chained
/// loop-associated directive, can be requested using \p UnrolledCLI. Not
/// needing the CanonicalLoopInfo allows more efficient code generation by
/// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
///
/// A loop-associated directive applied to the unrolled loop needs to know the
/// new trip count, which means that when a heuristically determined unroll
/// factor is used (\p Factor == 0), that factor must be computed immediately.
/// We are using the same logic as the LoopUnrollPass to derive the unroll
/// factor; that logic, however, assumes that some canonicalization has
/// already taken place (e.g. Mem2Reg, LICM, GVN, Inlining, etc.). That is, the
/// heuristic will perform better when the unrolled loop's CanonicalLoopInfo
/// is not needed.
///
/// \param DL          Debug location for instructions added by unrolling.
/// \param Loop        The loop to unroll. The loop will be invalidated.
/// \param Factor      The factor to unroll the loop by. A factor of 0
///                    indicates that a heuristic should be used to determine
///                    the unroll-factor.
/// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
///                    partially unrolled loop. Otherwise, uses loop metadata
///                    to defer unrolling to the LoopUnrollPass.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
CanonicalLoopInfo **UnrolledCLI);

/// Generator for '#omp flush'
///
/// \param Loc The location where the flush directive was encountered
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Analysis/LoopInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1102,6 +1102,11 @@ llvm::Optional<int> llvm::getOptionalIntLoopAttribute(const Loop *TheLoop,
return IntMD->getSExtValue();
}

int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
int Default) {
return getOptionalIntLoopAttribute(TheLoop, Name).getValueOr(Default);
}

static const char *LLVMLoopMustProgress = "llvm.loop.mustprogress";

bool llvm::hasMustProgress(const Loop *L) {
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Frontend/OpenMP/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,7 @@ add_llvm_component_library(LLVMFrontendOpenMP
Core
Support
TransformUtils
Analysis
MC
Scalar
)
299 changes: 298 additions & 1 deletion llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,28 @@
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <sstream>

Expand All @@ -39,6 +49,12 @@ static cl::opt<bool>
"'as-if' properties of runtime calls."),
cl::init(false));

// Multiplier applied to the unroll thresholds when the OpenMPIRBuilder has to
// pick an unroll factor itself. The IR it inspects has not been simplified
// yet, so the thresholds are scaled up to compensate.
static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor",
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5), cl::Hidden);

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
LLVMContext &Ctx = Fn.getContext();

Expand Down Expand Up @@ -2056,6 +2072,287 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
return Result;
}

/// Add the loop properties in \p Properties to the loop metadata of the loop
/// described by \p Loop. Properties already attached to the loop are kept and
/// the new ones are placed after them.
static void addLoopMetadata(CanonicalLoopInfo *Loop,
                            ArrayRef<Metadata *> Properties) {
  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");

  // With no new properties the existing metadata is left untouched.
  if (Properties.empty())
    return;

  // Loop metadata lives on the terminator of the latch (the back-edge branch).
  BasicBlock *Latch = Loop->getLatch();
  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
  Instruction *BackEdge = Latch->getTerminator();

  // Operand 0 of a loop ID refers to the loop ID itself. Reserve the slot now
  // and patch it once the node has been created.
  SmallVector<Metadata *> LoopIDOperands;
  LoopIDOperands.push_back(nullptr);

  // Carry over previously attached properties, skipping the old
  // self-reference in operand 0.
  if (MDNode *OldLoopID = BackEdge->getMetadata(LLVMContext::MD_loop))
    append_range(LoopIDOperands, drop_begin(OldLoopID->operands(), 1));
  append_range(LoopIDOperands, Properties);

  LLVMContext &Ctx = Loop->getFunction()->getContext();
  MDNode *NewLoopID = MDNode::getDistinct(Ctx, LoopIDOperands);
  NewLoopID->replaceOperandWith(0, NewLoopID);
  BackEdge->setMetadata(LLVMContext::MD_loop, NewLoopID);
}

/// Request full unrolling of \p Loop by tagging it with unroll metadata; the
/// body duplication itself is left to a later pass. No instructions are
/// created here, hence the debug location argument is unused.
void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();

  // "enable" is listed ahead of "full", matching the order in which the
  // properties end up in the loop ID.
  Metadata *UnrollProperties[] = {
      MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
      MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))};
  addLoopMetadata(Loop, UnrollProperties);
}

/// Mark \p Loop as a candidate for unrolling without prescribing how: only the
/// "enable" property is attached. No instructions are created here, hence the
/// debug location argument is unused.
void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  MDString *EnableTag = MDString::get(Ctx, "llvm.loop.unroll.enable");
  Metadata *EnableUnroll = MDNode::get(Ctx, EnableTag);
  addLoopMetadata(Loop, {EnableUnroll});
}

/// Build a TargetMachine for \p F so the backend can be asked about its
/// optimization preferences. Returns a null pointer if no target is registered
/// for the module's target triple.
///
/// Ideally, the front-end would hand its TargetMachine to the OpenMPIRBuilder,
/// but e.g. Clang does not pass it to its CodeGen layer and creates it only
/// when needed for the LLVM pass pipeline. Default TargetOptions are used to
/// avoid having to pass too many settings from the front-end that probably do
/// not matter.
///
/// Currently, the TargetMachine is only used sometimes by the
/// unrollLoopPartial method. If it is going to be used for more purposes,
/// especially ones that are sensitive to TargetOptions, RelocModel and
/// CodeModel, it might become worth requiring front-ends to pass on their
/// TargetMachine, or at least caching it between methods. Note that while
/// front-ends such as Clang have just a single main TargetMachine per
/// translation unit, the "target-cpu" and "target-features" attributes that
/// determine the TargetMachine are per-function and can be overridden using
/// __attribute__((target("OPTIONS"))).
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOpt::Level OptLevel) {
  const std::string &TripleStr = F->getParent()->getTargetTriple();

  // Without a registered target for this triple there is nothing to query.
  std::string LookupError;
  const llvm::Target *Tgt =
      TargetRegistry::lookupTarget(TripleStr, LookupError);
  if (Tgt == nullptr)
    return nullptr;

  // CPU and feature string are taken from the function, not the module.
  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();

  llvm::TargetOptions DefaultOptions;
  TargetMachine *RawTM = Tgt->createTargetMachine(
      TripleStr, CPU, Features, DefaultOptions, /*RelocModel=*/None,
      /*CodeModel=*/None, OptLevel);
  return std::unique_ptr<TargetMachine>(RawTM);
}

/// Heuristically determine the best-performant unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as the
/// LoopUnrollPass.
///
/// \param CLI The canonical loop to compute an unroll factor for.
/// \returns the suggested unroll factor. Returns 1 (meaning "do not unroll")
///          if the loop contains non-duplicatable or convergent instructions,
///          or if the heuristic suggests a count of 0.
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
Function *F = CLI->getFunction();

// Assume the user requests the most aggressive unrolling, even if the rest of
// the code is optimized using a lower setting.
CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

// Set up a function-local analysis manager with only the analyses registered
// below.
FunctionAnalysisManager FAM;
FAM.registerPass([]() { return TargetLibraryAnalysis(); });
FAM.registerPass([]() { return AssumptionAnalysis(); });
FAM.registerPass([]() { return DominatorTreeAnalysis(); });
FAM.registerPass([]() { return LoopAnalysis(); });
FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
// Query the target through TM if one could be created; otherwise the
// default-constructed TargetIRAnalysis is kept.
TargetIRAnalysis TIRA;
if (TM)
TIRA = TargetIRAnalysis(
[&](const Function &F) { return TM->getTargetTransformInfo(F); });
FAM.registerPass([&]() { return TIRA; });

// Run each analysis directly on F and keep its result for the heuristics
// below.
TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
ScalarEvolutionAnalysis SEA;
ScalarEvolution &&SE = SEA.run(*F, FAM);
DominatorTreeAnalysis DTA;
DominatorTree &&DT = DTA.run(*F, FAM);
LoopAnalysis LIA;
LoopInfo &&LI = LIA.run(*F, FAM);
AssumptionAnalysis ACT;
AssumptionCache &&AC = ACT.run(*F, FAM);
OptimizationRemarkEmitter ORE{F};

// Look up the loop that contains the header block of CLI.
Loop *L = LI.getLoopFor(CLI->getHeader());
assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

// Gather the unrolling preferences. No user-specified threshold, count or
// bounds are passed; partial and runtime unrolling are both allowed.
TargetTransformInfo::UnrollingPreferences UP =
gatherUnrollingPreferences(L, SE, TTI,
/*BlockFrequencyInfo=*/nullptr,
/*ProfileSummaryInfo=*/nullptr, ORE, OptLevel,
/*UserThreshold=*/None,
/*UserCount=*/None,
/*UserAllowPartial=*/true,
/*UserAllowRuntime=*/true,
/*UserUpperBound=*/None,
/*UserFullUnrollMaxCount=*/None);

// NOTE(review): presumably makes the heuristic unroll even where the cost
// model alone would decline -- confirm against UnrollingPreferences::Force.
UP.Force = true;

// Account for additional optimizations taking place before the LoopUnrollPass
// would unroll the loop.
UP.Threshold *= UnrollThresholdFactor;
UP.PartialThreshold *= UnrollThresholdFactor;

// Use normal unroll factors even if the rest of the code is optimized for
// size.
UP.OptSizeThreshold = UP.Threshold;
UP.PartialOptSizeThreshold = UP.PartialThreshold;

LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
<< " Threshold=" << UP.Threshold << "\n"
<< " PartialThreshold=" << UP.PartialThreshold << "\n"
<< " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
<< " PartialOptSizeThreshold="
<< UP.PartialOptSizeThreshold << "\n");

// Disable peeling.
TargetTransformInfo::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI,
/*UserAllowPeeling=*/false,
/*UserAllowProfileBasedPeeling=*/false,
/*UserUnrollingSpecficValues=*/false);

// Values in this set are passed to the size estimate and to
// computeUnrollCount below.
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

// Assume that reads and writes to stack variables can be eliminated by
// Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
// size.
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : *BB) {
// Only loads and stores are of interest; Ptr is their pointer operand.
Value *Ptr;
if (auto *Load = dyn_cast<LoadInst>(&I)) {
Ptr = Load->getPointerOperand();
} else if (auto *Store = dyn_cast<StoreInst>(&I)) {
Ptr = Store->getPointerOperand();
} else
continue;

// Look through pointer casts to find the underlying object.
Ptr = Ptr->stripPointerCasts();

// Only accesses to allocas located in the function's entry block are
// added to the ephemeral set.
if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
if (Alloca->getParent() == &F->getEntryBlock())
EphValues.insert(&I);
}
}
}

// These three are handed to ApproximateLoopSize uninitialized and are only
// read after that call.
unsigned NumInlineCandidates;
bool NotDuplicatable;
bool Convergent;
unsigned LoopSize =
ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
TTI, EphValues, UP.BEInsns);
LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSize << "\n");

// Loop is not unrollable if the loop contains certain instructions.
if (NotDuplicatable || Convergent) {
LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
return 1;
}

// TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
// be able to use it.
// Until then, zeros are passed for the trip count information (presumably
// meaning "unknown" -- confirm against computeUnrollCount).
int TripCount = 0;
int MaxTripCount = 0;
bool MaxOrZero = false;
unsigned TripMultiple = 0;

bool UseUpperBound = false;
computeUnrollCount(L, TTI, DT, &LI, SE, EphValues, &ORE, TripCount,
MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP,
UseUpperBound);
// The suggested factor is read back from UP.Count after the call.
unsigned Factor = UP.Count;
LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

// This function returns 1 to signal to not unroll a loop.
if (Factor == 0)
return 1;
return Factor;
}

// Partially unroll \p Loop by \p Factor; a factor of 0 requests a
// heuristically chosen factor.
//
// When \p UnrolledCLI is null, the loop is only annotated with unroll metadata
// for the LoopUnrollPass. Otherwise the loop is tiled by the factor, the inner
// tile loop is annotated for unrolling, and *UnrolledCLI receives the outer
// loop (or \p Loop itself if the factor turns out to be 1).
void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *Fn = Loop->getFunction();
  LLVMContext &Context = Fn->getContext();

  // Metadata-only path: nothing else consumes the unrolled loop, so deferring
  // the actual unrolling to the LoopUnrollPass is sufficient.
  if (UnrolledCLI == nullptr) {
    SmallVector<Metadata *, 2> Properties;
    Properties.push_back(
        MDNode::get(Context, MDString::get(Context, "llvm.loop.unroll.enable")));

    // Only pin a count if the caller asked for a specific factor.
    if (Factor > 0) {
      ConstantAsMetadata *CountValue = ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Context), APInt(32, Factor)));
      Properties.push_back(MDNode::get(
          Context,
          {MDString::get(Context, "llvm.loop.unroll.count"), CountValue}));
    }

    addLoopMetadata(Loop, Properties);
    return;
  }

  // A factor of 0 means the factor has to be chosen heuristically.
  if (Factor == 0)
    Factor = computeHeuristicUnrollFactor(Loop);

  // Unrolling by 1 leaves the loop as it is.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  // Partial unrolling is implemented as tiling by the unroll factor followed
  // by a full unroll of the resulting inner loop.
  Type *IVTy = Loop->getIndVarType();
  const unsigned IVBits = IVTy->getIntegerBitWidth();
  Value *TileSize =
      ConstantInt::get(IVTy, APInt(IVBits, Factor, /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> Tiled = tileLoops(DL, {Loop}, {TileSize});
  assert(Tiled.size() == 2 && "Expect 2 loops after tiling");
  CanonicalLoopInfo *Outer = Tiled[0];
  CanonicalLoopInfo *Inner = Tiled[1];
  *UnrolledCLI = Outer;

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Request unrolling by the unroll factor instead, with a fallback epilog
  // for the remainder iterations if necessary.
  ConstantAsMetadata *CountValue = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt32Ty(Context), APInt(32, Factor)));
  MDNode *EnableMD =
      MDNode::get(Context, MDString::get(Context, "llvm.loop.unroll.enable"));
  MDNode *CountMD = MDNode::get(
      Context, {MDString::get(Context, "llvm.loop.unroll.count"), CountValue});
  addLoopMetadata(Inner, {EnableMD, CountMD});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
llvm::Value *BufSize, llvm::Value *CpyBuf,
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/IPO/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,5 @@ add_llvm_component_library(LLVMipo
TransformUtils
Vectorize
Instrumentation
Scalar
)
140 changes: 118 additions & 22 deletions llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "gtest/gtest.h"

Expand Down Expand Up @@ -142,6 +143,40 @@ class OpenMPIRBuilderTest : public testing::Test {
M.reset();
}

/// Populate the fixture's function with a single canonical loop whose body
/// calls printf on the logical loop counter, for tests that need a
/// CanonicalLoopInfo object. The trip count is the function's first argument.
///
/// \param DL         Debug location used for the loop's location description.
/// \param OMPBuilder Builder that is initialized and used to create the loop.
/// \param Call       If non-null, receives the printf call emitted in the body.
/// \param BodyCode   If non-null, receives the block the body is emitted into.
/// \returns the CanonicalLoopInfo of the created loop.
CanonicalLoopInfo *buildSingleLoopFunction(DebugLoc DL,
                                           OpenMPIRBuilder &OMPBuilder,
                                           Instruction **Call = nullptr,
                                           BasicBlock **BodyCode = nullptr) {
  OMPBuilder.initialize();
  F->setName("func");

  IRBuilder<> Builder(BB);
  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
  Value *NumIterations = F->getArg(0);

  auto EmitBody = [&](OpenMPIRBuilder::InsertPointTy BodyIP,
                      llvm::Value *IndVar) {
    Builder.restoreIP(BodyIP);
    if (BodyCode != nullptr)
      *BodyCode = Builder.GetInsertBlock();

    // Give the body a use of the induction variable.
    CallInst *PrintfCall = createPrintfCall(Builder, "%d\\n", {IndVar});
    if (Call != nullptr)
      *Call = PrintfCall;
  };
  CanonicalLoopInfo *CLI =
      OMPBuilder.createCanonicalLoop(Loc, EmitBody, NumIterations);

  // Terminate the function after the loop.
  Builder.restoreIP(CLI->getAfterIP());
  Builder.CreateRetVoid();

  return CLI;
}

LLVMContext Ctx;
std::unique_ptr<Module> M;
Function *F;
Expand Down Expand Up @@ -1288,30 +1323,11 @@ TEST_F(OpenMPIRBuilderTest, CollapseNestedLoops) {
}

TEST_F(OpenMPIRBuilderTest, TileSingleLoop) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
OMPBuilder.initialize();
F->setName("func");

IRBuilder<> Builder(BB);
OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
Value *TripCount = F->getArg(0);

BasicBlock *BodyCode = nullptr;
Instruction *Call = nullptr;
auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, llvm::Value *LC) {
Builder.restoreIP(CodeGenIP);
BodyCode = Builder.GetInsertBlock();

// Add something that consumes the induction variable to the body.
Call = createPrintfCall(Builder, "%d\\n", {LC});
};
Instruction *Call;
BasicBlock *BodyCode;
CanonicalLoopInfo *Loop =
OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, TripCount);

// Finalize the function.
Builder.restoreIP(Loop->getAfterIP());
Builder.CreateRetVoid();
buildSingleLoopFunction(DL, OMPBuilder, &Call, &BodyCode);

Instruction *OrigIndVar = Loop->getIndVar();
EXPECT_EQ(Call->getOperand(1), OrigIndVar);
Expand Down Expand Up @@ -1648,6 +1664,86 @@ TEST_F(OpenMPIRBuilderTest, TileSingleLoopCounts) {
EXPECT_FALSE(verifyModule(*M, &errs()));
}

/// Checks that unrollLoopFull leaves a single top-level loop carrying the
/// "llvm.loop.unroll.enable" and "llvm.loop.unroll.full" attributes.
TEST_F(OpenMPIRBuilderTest, UnrollLoopFull) {
  OpenMPIRBuilder OMPBuilder(*M);

  CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);

  // Unroll the loop.
  OMPBuilder.unrollLoopFull(DL, CLI);

  OMPBuilder.finalize();
  EXPECT_FALSE(verifyModule(*M, &errs()));

  // Recompute LoopInfo from the generated IR.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(*F);

  // Fatal assertion: front() below must not be called on an empty vector.
  const std::vector<Loop *> &TopLvl = LI.getTopLevelLoops();
  ASSERT_EQ(TopLvl.size(), 1u);

  Loop *L = TopLvl.front();
  EXPECT_TRUE(getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"));
  EXPECT_TRUE(getBooleanLoopAttribute(L, "llvm.loop.unroll.full"));
}

/// Checks that unrollLoopPartial with a factor of 5 and a requested result
/// loop produces a two-level loop nest: the outer loop matches the returned
/// CanonicalLoopInfo and the inner loop carries the unroll metadata.
TEST_F(OpenMPIRBuilderTest, UnrollLoopPartial) {
  OpenMPIRBuilder OMPBuilder(*M);
  CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);

  // Unroll the loop.
  CanonicalLoopInfo *UnrolledLoop = nullptr;
  OMPBuilder.unrollLoopPartial(DL, CLI, 5, &UnrolledLoop);
  ASSERT_NE(UnrolledLoop, nullptr);

  OMPBuilder.finalize();
  EXPECT_FALSE(verifyModule(*M, &errs()));
  UnrolledLoop->assertOK();

  // Recompute LoopInfo from the generated IR.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(*F);

  // Fatal assertion: front() below must not be called on an empty vector.
  const std::vector<Loop *> &TopLvl = LI.getTopLevelLoops();
  ASSERT_EQ(TopLvl.size(), 1u);
  Loop *Outer = TopLvl.front();
  EXPECT_EQ(Outer->getHeader(), UnrolledLoop->getHeader());
  EXPECT_EQ(Outer->getLoopLatch(), UnrolledLoop->getLatch());
  EXPECT_EQ(Outer->getExitingBlock(), UnrolledLoop->getCond());
  EXPECT_EQ(Outer->getExitBlock(), UnrolledLoop->getExit());

  // Same reasoning for the sub-loop list before taking its front().
  ASSERT_EQ(Outer->getSubLoops().size(), 1u);
  Loop *Inner = Outer->getSubLoops().front();

  EXPECT_TRUE(getBooleanLoopAttribute(Inner, "llvm.loop.unroll.enable"));
  EXPECT_EQ(getIntLoopAttribute(Inner, "llvm.loop.unroll.count"), 5);
}

/// Checks that unrollLoopHeuristic leaves a single top-level loop carrying the
/// "llvm.loop.unroll.enable" attribute.
TEST_F(OpenMPIRBuilderTest, UnrollLoopHeuristic) {
  OpenMPIRBuilder OMPBuilder(*M);

  CanonicalLoopInfo *CLI = buildSingleLoopFunction(DL, OMPBuilder);

  // Unroll the loop.
  OMPBuilder.unrollLoopHeuristic(DL, CLI);

  OMPBuilder.finalize();
  EXPECT_FALSE(verifyModule(*M, &errs()));

  // Recompute LoopInfo from the generated IR.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(*F);

  // Fatal assertion: front() below must not be called on an empty vector.
  const std::vector<Loop *> &TopLvl = LI.getTopLevelLoops();
  ASSERT_EQ(TopLvl.size(), 1u);

  Loop *L = TopLvl.front();
  EXPECT_TRUE(getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"));
}

TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
Expand Down