diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index c3f60d7f60eff..eb21bbde8d9d4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1274,9 +1274,11 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty, auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, Name, &CGM.getModule()); CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, FnInfo); - Fn->removeFnAttr(llvm::Attribute::NoInline); - Fn->removeFnAttr(llvm::Attribute::OptimizeNone); - Fn->addFnAttr(llvm::Attribute::AlwaysInline); + if (CGM.getLangOpts().Optimize) { + Fn->removeFnAttr(llvm::Attribute::NoInline); + Fn->removeFnAttr(llvm::Attribute::OptimizeNone); + Fn->addFnAttr(llvm::Attribute::AlwaysInline); + } CodeGenFunction CGF(CGM); // Map "T omp_in;" variable to "*omp_in_parm" value in all expressions. // Map "T omp_out;" variable to "*omp_out_parm" value in all expressions. @@ -4671,9 +4673,11 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, &CGM.getModule()); CGM.SetInternalFunctionAttributes(GlobalDecl(), TaskPrivatesMap, TaskPrivatesMapFnInfo); - TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline); - TaskPrivatesMap->removeFnAttr(llvm::Attribute::OptimizeNone); - TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline); + if (CGM.getLangOpts().Optimize) { + TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline); + TaskPrivatesMap->removeFnAttr(llvm::Attribute::OptimizeNone); + TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline); + } CodeGenFunction CGF(CGM); CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap, TaskPrivatesMapFnInfo, Args, Loc, Loc); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index ca1e9311b6b4f..e6f9d971d7dfb 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -1929,6 +1929,11 @@ llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( auto *OutlinedFun = cast(CGOpenMPRuntime::emitParallelOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen)); + if (CGM.getLangOpts().Optimize) { + OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); + OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone); + OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); + } IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion; IsInTTDRegion = PrevIsInTTDRegion; if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD && @@ -2045,9 +2050,11 @@ llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( CodeGen.setAction(Action); llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen); - OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); - OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone); - OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); + if (CGM.getLangOpts().Optimize) { + OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); + OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone); + OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); + } return OutlinedFun; } @@ -3422,6 +3429,12 @@ static llvm::Function *emitShuffleAndReduceFunction( "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); Fn->setDoesNotRecurse(); + if (CGM.getLangOpts().Optimize) { + Fn->removeFnAttr(llvm::Attribute::NoInline); + Fn->removeFnAttr(llvm::Attribute::OptimizeNone); + Fn->addFnAttr(llvm::Attribute::AlwaysInline); + } + CodeGenFunction CGF(CGM); CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp index 06d7e5ce64565..04882d346cc46 100644 --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s // expected-no-diagnostics int foo(int &a) { return a; } diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp index 8cb88f61c1055..0eee597d0232b 100644 --- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp +++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER @@ -52,9 +52,8 @@ void init_plus(BaseS1&, const BaseS1&); // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, float } // CHECK-DAG: [[S_INT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, i{{[0-9]+}} } -// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8* -// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8* -// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8* +// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* +// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8* // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer #pragma omp declare reduction(operator&& : int : omp_out = 111 & omp_in) @@ -173,13 +172,13 @@ int main() { // CHECK: [[T_VAR1_REF:%.+]] = load float*, float** % // For + reduction operation initial value of private variable is -1. -// CHECK: store float -1.0{{.+}}, float* +// CHECK: call void [[RED_INIT1:@.+]](float* %{{.+}}, float* %{{.+}}) // For & reduction operation initial value of private variable is defined by call of 'init()' function. -// CHECK: call {{.*}}void @_Z4initR6BaseS1RKS_( +// CHECK: call void [[RED_INIT2:@.+]]( // For && reduction operation initial value of private variable is 1.0. -// CHECK: call {{.*}}void @_Z5init1R6BaseS1RKS_( +// CHECK: call void [[RED_INIT3:@.+]]( // For min reduction operation initial value of private variable is largest repesentable value. // CHECK: [[INIT:%.+]] = load float, float* @ @@ -219,16 +218,16 @@ int main() { // case 1: // t_var += t_var_reduction; -// CHECK: fsub float 2.220000e+02, % +// CHECK: call void [[RED_COMB1:@.+]](float* %{{.+}}, float* %{{.+}}) // var = var.operator &(var_reduction); -// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_( +// CHECK: call void [[RED_COMB2:@.+]]( // var1 = var1.operator &&(var1_reduction); -// CHECK: fmul float +// CHECK: call void [[RED_COMB3:@.+]]( // t_var1 = min(t_var1, t_var1_reduction); -// CHECK: fadd float 5.550000e+02, % +// CHECK: call void [[RED_COMB4:@.+]]( // __kmpc_end_reduce(, , &); // CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]]) @@ -239,22 +238,22 @@ int main() { // case 2: // t_var += t_var_reduction; // CHECK: call void @__kmpc_critical( -// CHECK: fsub float 2.220000e+02, % +// CHECK: call void [[RED_COMB1]](float* %{{.+}}, float* %{{.+}}) // CHECK: call void @__kmpc_end_critical( // var = var.operator &(var_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_( +// CHECK: call void [[RED_COMB2]] // CHECK: call void @__kmpc_end_critical( // var1 = var1.operator &&(var1_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: fmul float +// CHECK: call void [[RED_COMB3]] // CHECK: call void @__kmpc_end_critical( // t_var1 = min(t_var1, t_var1_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: fadd float 5.550000e+02, % +// CHECK: call void [[RED_COMB4]] // CHECK: call void @__kmpc_end_critical( // __kmpc_end_reduce(, , &); @@ -269,6 +268,24 @@ int main() { // CHECK: ret void +// CHECK: define internal void [[RED_COMB1]](float* noalias, float* noalias) +// CHECK: fsub float 2.220000e+02, % + +// CHECK: define internal void [[RED_INIT1]](float* noalias, float* noalias) +// CHECK: store float -1.0{{.+}}, float* + +// CHECK: define internal void [[RED_COMB2]]( +// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_( + +// CHECK: define internal void [[RED_INIT2]]( +// CHECK: call {{.*}}void @_Z4initR6BaseS1RKS_( + +// CHECK: define internal void [[RED_COMB3]]( +// CHECK: fmul float + +// CHECK: define internal void [[RED_INIT3]]( +// CHECK: call {{.*}}void @_Z5init1R6BaseS1RKS_( + // void reduce_func(void *lhs[], void *rhs[]) { // *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]); // ... @@ -313,18 +330,21 @@ int main() { // CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to float* // t_var_lhs += t_var_rhs; -// CHECK: fsub float 2.220000e+02, % +// CHECK: call void [[RED_COMB1]](float* %{{.+}}, float* %{{.+}}) // var_lhs = var_lhs.operator &(var_rhs); -// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_( +// CHECK: call void [[RED_COMB2]]( // var1_lhs = var1_lhs.operator &&(var1_rhs); -// CHECK: fmul float +// CHECK: call void [[RED_COMB3]]( // t_var1_lhs = min(t_var1_lhs, t_var1_rhs); -// CHECK: fadd float 5.550000e+02, % +// CHECK: call void [[RED_COMB4]]( // CHECK: ret void +// CHECK: define internal void [[RED_COMB4]]( +// CHECK: fadd float 5.550000e+02, % + // CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}} %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(480) %{{.+}}) // Reduction list for runtime. @@ -348,7 +368,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: store i32 888, i32* % +// CHECK: call void [[RED_INIT5:@.+]](i32* %{{.+}}, i32* %{{.+}}) // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -359,7 +379,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_PRIV]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z4initR6BaseS1RKS_(% +// CHECK: call void [[RED_INIT2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -405,8 +425,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: [[ADD:%.+]] = mul nsw i32 555, % -// CHECK: store i32 [[ADD]], i32* % +// CHECK: call void [[RED_COMB5:@.+]](i32* %{{.+}}, i32* %{{.+}}) // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -415,7 +434,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -434,7 +453,7 @@ int main() { // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* // CHECK: call void @__kmpc_critical( -// CHECK: [[ADD:%.+]] = mul nsw i32 555, % +// CHECK: call void [[RED_COMB5]]( // CHECK: call void @__kmpc_end_critical( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -445,7 +464,7 @@ int main() { // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* // CHECK: call void @__kmpc_critical( -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: call void @__kmpc_end_critical( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -466,6 +485,12 @@ int main() { // CHECK: ret void +// CHECK: define internal void [[RED_COMB5]](i32* noalias, i32* noalias) +// CHECK: mul nsw i32 555, % + +// CHECK: define internal void [[RED_INIT5]](i32* noalias, i32* noalias) +// CHECK: store i32 888, i32* % + // void reduce_func(void *lhs[], void *rhs[]) { // *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]); // ... @@ -506,7 +531,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: [[ADD:%.+]] = mul nsw i32 555, % +// CHECK: call void [[RED_COMB5]]( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -515,7 +540,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -539,7 +564,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: store i32 888, i32* % +// CHECK: call void [[RED_INIT5]]( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -550,7 +575,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[BEGIN]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z4initR6BaseS1RKS_(% +// CHECK: call void [[RED_INIT2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], // CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]* @@ -595,8 +620,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: [[ADD:%[^ ]+]] = mul nsw i32 555, % -// CHECK: store i32 [[ADD]], i32* % +// CHECK: call void [[RED_COMB5]]( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -605,7 +629,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -624,7 +648,7 @@ int main() { // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* // CHECK: call void @__kmpc_critical( -// CHECK: [[ADD:%.+]] = mul nsw i32 555, % +// CHECK: call void [[RED_COMB5]]( // CHECK: call void @__kmpc_end_critical( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -635,7 +659,7 @@ int main() { // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* // CHECK: call void @__kmpc_critical( -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: call void @__kmpc_end_critical( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -692,8 +716,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi i32* -// CHECK: [[ADD:%.+]] = mul nsw i32 555, % -// CHECK: store i32 [[ADD]], i32* % +// CHECK: call void [[RED_COMB5]]( // CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -702,7 +725,7 @@ int main() { // CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]] // CHECK: br i1 [[ISEMPTY]], // CHECK: phi [[S_FLOAT_TY]]* -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]] // CHECK: br i1 [[DONE]], @@ -843,16 +866,16 @@ int main() { // CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** % // For + reduction operation initial value of private variable is 0. -// CHECK: store i32 321, i32* % +// CHECK: call void [[RED_INIT6:@.+]]( // For & reduction operation initial value of private variable is ones in all bits. -// CHECK: call void @_Z4initR6BaseS1RKS_( +// CHECK: call void [[RED_INIT2:@.+]]( // For && reduction operation initial value of private variable is 1.0. -// CHECK: call void @_Z5init2R6BaseS1RKS_( +// CHECK: call void [[RED_INIT7:@.+]]( // For min reduction operation initial value of private variable is largest repesentable value. -// CHECK: sdiv i32 432, % +// CHECK: call void [[RED_INIT8:@.+]]( // CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]] // CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]] @@ -888,16 +911,16 @@ int main() { // case 1: // t_var += t_var_reduction; -// CHECK: add nsw i32 1513, % +// CHECK: call void [[RED_COMB6:@.+]]( // var = var.operator &(var_reduction); -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // var1 = var1.operator &&(var1_reduction); -// CHECK: mul nsw i32 17, % +// CHECK: call void [[RED_COMB7:@.+]]( // t_var1 = min(t_var1, t_var1_reduction); -// CHECK: sub nsw i32 47, % +// CHECK: call void [[RED_COMB8:@.+]]( // __kmpc_end_reduce_nowait(, , &); // CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]]) @@ -908,22 +931,22 @@ int main() { // case 2: // t_var += t_var_reduction; // CHECK: call void @__kmpc_critical( -// CHECK: add nsw i32 1513, % +// CHECK: call void [[RED_COMB6]]( // CHECK: call void @__kmpc_end_critical( // var = var.operator &(var_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // CHECK: call void @__kmpc_end_critical( // var1 = var1.operator &&(var1_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: mul nsw i32 17, % +// CHECK: call void [[RED_COMB7]]( // CHECK: call void @__kmpc_end_critical( // t_var1 = min(t_var1, t_var1_reduction); // CHECK: call void @__kmpc_critical( -// CHECK: sub nsw i32 47, % +// CHECK: call void [[RED_COMB8]]( // CHECK: call void @__kmpc_end_critical( // break; @@ -933,6 +956,24 @@ int main() { // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* // CHECK: ret void +// CHECK: define internal void [[RED_COMB6]](i32* noalias, i32* noalias) +// CHECK: add nsw i32 1513, % + +// CHECK: define internal void [[RED_INIT6]](i32* noalias, i32* noalias) +// CHECK: store i32 321, i32* % + +// CHECK: define internal void [[RED_COMB7]]( +// CHECK: mul nsw i32 17, % + +// CHECK: define internal void [[RED_INIT7]]( +// CHECK: call void @_Z5init2R6BaseS1RKS_( + +// CHECK: define internal void [[RED_COMB8]](i32* noalias, i32* noalias) +// CHECK: sub nsw i32 47, % + +// CHECK: define internal void [[RED_INIT8]](i32* noalias, i32* noalias) +// CHECK: sdiv i32 432, % + // void reduce_func(void *lhs[], void *rhs[]) { // *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]); // ... @@ -977,16 +1018,16 @@ int main() { // CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}* // t_var_lhs += t_var_rhs; -// CHECK: add nsw i32 1513, % +// CHECK: call void [[RED_COMB6]]( // var_lhs = var_lhs.operator &(var_rhs); -// CHECK: call void @_Z3redR6BaseS1RKS_(% +// CHECK: call void [[RED_COMB2]]( // var1_lhs = var1_lhs.operator &&(var1_rhs); -// CHECK: mul nsw i32 17, % +// CHECK: call void [[RED_COMB7]]( // t_var1_lhs = min(t_var1_lhs, t_var1_rhs); -// CHECK: sub nsw i32 47, % +// CHECK: call void [[RED_COMB8]]( // CHECK: ret void // CHECK: define internal void [[TMAIN_MICROTASK2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [42 x [[S_INT_TY]]]* dereferenceable(504) %{{.*}}, [2 x i32]* dereferenceable(8) %{{.*}}, i32* dereferenceable(4) %{{.*}}, [2 x [[S_INT_TY]]]* dereferenceable(24) %{{.*}}, [[S_INT_TY]]* dereferenceable(12) %{{.*}}) diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp b/clang/test/OpenMP/nvptx_allocate_codegen.cpp index 9a285d0d093ce..1214845a42011 100644 --- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -verify -fopenmp -triple x86_64-apple-darwin10.6.0 -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc -o %t-host.bc %s -// RUN: %clang_cc1 -verify -fopenmp -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-host.bc -o - -disable-llvm-optzns | FileCheck %s // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp index 7b21d827941c7..b9f050f293320 100644 --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -2,7 +2,7 @@ ///==========================================================================/// // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 // expected-no-diagnostics diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 9470aa7972abf..7296e0e9d0d37 100644 --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -1,9 +1,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -25,12 +25,16 @@ int main(int argc, char **argv) { // CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40 +// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 0 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 40, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* +// CHECK: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 +// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty* // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 // CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], // CHECK: call void @__kmpc_for_static_init_4( @@ -39,7 +43,8 @@ int main(int argc, char **argv) { // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) // CHECK: define internal void [[PARALLEL]]( // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp index f57fef638f435..252d237036f5e 100644 --- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp +++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp @@ -26,11 +26,11 @@ // CHECK-DAG: [[CAP1:%.+]] = type { [[S]]* } // CHECK-DAG: [[CAP2:%.+]] = type { i32*, i32*, i32*, i32**, i32* } -// CLASS: define internal void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l72_worker() -// CLASS: define weak void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l72([[S]]* {{%.+}}, [[CAP1]]* dereferenceable(8) {{%.+}}) +// CLASS: define internal void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker() +// CLASS: define weak void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67([[S]]* {{%.+}}, [[CAP1]]* dereferenceable(8) {{%.+}}) // CLASS-NOT: getelementptr // CLASS: br i1 % -// CLASS: call void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l72_worker() +// CLASS: call void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker() // CLASS: br label % // CLASS: br i1 % // CLASS: call void @__kmpc_kernel_init( @@ -43,12 +43,7 @@ // CLASS: call i32 [[LAMBDA1:@.+foo.+]]([[CAP1]]* [[L]]) // CLASS: ret void -// CLASS: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l74([[S]]* %{{.+}}, [[CAP1]]* dereferenceable(8) %{{.+}}) -// CLASS-NOT: getelementptr -// CLASS: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, [[S]]* %{{.+}}, [[CAP1]]* %{{.+}}) -// CLASS: ret void - -// CLASS: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, [[S]]* %{{.+}}, [[CAP1]]* dereferenceable(8) %{{.+}}) +// CLASS: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l69([[S]]* %{{.+}}, [[CAP1]]* dereferenceable(8) %{{.+}}) // CLASS-NOT: getelementptr // CLASS: call void @llvm.memcpy. // CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]], @@ -77,11 +72,11 @@ struct S { } } s; -// FUN: define internal void @__omp_offloading_{{.+}}_main_l134_worker() -// FUN: define weak void @__omp_offloading_{{.+}}_main_l134(i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[CAP2]]* dereferenceable(40) %{{.+}}, i64 %{{.+}}) +// FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker() +// FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[CAP2]]* dereferenceable(40) %{{.+}}, i64 %{{.+}}) // FUN-NOT: getelementptr // FUN: br i1 % -// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l134_worker() +// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker() // FUN: br label % // FUN: br i1 % // FUN: call void @__kmpc_kernel_init( @@ -98,19 +93,14 @@ struct S { // FUN: store i32** %{{.+}}, i32*** [[D_CAP]], // FUN: [[A_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 4 // FUN: store i32* %{{.+}}, i32** [[A_CAP]], -// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], +// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR]], // FUN: call i64 [[LAMBDA2:@.+main.+]]([[CAP2]]* [[L]]) // FUN: ret void -// FUN: define weak void @__omp_offloading_{{.+}}_main_l136(i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}} i32* dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[CAP2]]* dereferenceable(40) %{{.+}}) -// FUN-NOT: getelementptr -// FUN: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, i32* %{{.+}}, [[CAP2]]* %{{.+}}) -// FUN: ret void - -// FUN: define internal void [[PARALLEL:@.+]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[CAP2]]* dereferenceable(40) %{{.+}}) +// FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) %{{.+}} i32* dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[CAP2]]* dereferenceable(40) %{{.+}}) // FUN-NOT: getelementptr // FUN: call void @llvm.memcpy. -// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR]], +// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], // FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0 // FUN: store i32* %{{.+}}, i32** [[ARGC_CAP]], // FUN: [[B_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 1 diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp index 04089ce3f5b62..cdbc8872445d9 100644 --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,9 +1,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp index 1446ba50cea85..94b4959853d8e 100644 --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,6 +1,6 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp index 906ff928c475e..85c85a3078ed5 100644 --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -1,9 +1,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp index 7964d768d799c..029e4a469a698 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -1,9 +1,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp index fc90d34da3640..1aac48198415c 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -1,9 +1,9 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp index 069eecb6079ec..b96b5da8f6978 100644 --- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -127,9 +127,7 @@ int bar(int n){ // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i8* // CHECK: store i8 [[A_VAL]], i8* [[ACP]], align // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align - // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[A_ADDR_T:%.+]], align - // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[A_ADDR_T]] to i8* - // CHECK: store i8 49, i8* [[CONV2]], align + // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] [[ACV]]) // CHECK: br label {{%?}}[[TERMINATE:.+]] // // CHECK: [[TERMINATE]] @@ -140,10 +138,12 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - - - - + // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] [[A_VAL:%.+]]) + // CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], + // CHECK: store i[[SZ]] [[A_VAL]], i[[SZ]]* [[A_ADDR]], + // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8* + // CHECK: store i8 49, i8* [[CONV]], + // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l32}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, @@ -213,9 +213,7 @@ int bar(int n){ // CHECK: [[ACP:%.+]] = bitcast i[[SZ]]* [[AC:%.+]] to i16* // CHECK: store i16 [[AA_VAL]], i16* [[ACP]], align // CHECK: [[ACV:%.+]] = load i[[SZ]], i[[SZ]]* [[AC]], align - // CHECK: store i[[SZ]] [[ACV]], i[[SZ]]* [[AA_ADDR_T:%.+]], align - // CHECK: [[CONV2:%.+]] = bitcast i[[SZ]]* [[AA_ADDR_T]] to i16* - // CHECK: store i16 1, i16* [[CONV2]], align + // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] [[ACV]]) // CHECK: br label {{%?}}[[TERMINATE:.+]] // // CHECK: [[TERMINATE]] @@ -226,24 +224,35 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void + // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] [[A_VAL:%.+]]) + // CHECK: [[A_ADDR:%.+]] = alloca i[[SZ]], + // CHECK: store i[[SZ]] [[A_VAL]], i[[SZ]]* [[A_ADDR]], + // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i16* + // CHECK: store i16 1, i16* [[CONV]], + // CHECK: ret void + // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( // CHECK-NOT: call void @__kmpc_serialized_parallel( -// CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.*}}) +// CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.+}}) // CHECK-NOT: call void @__kmpc_end_serialized_parallel( // CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: ret -// CHECK: define internal void [[L0]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable -// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: define internal void [[L0]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i[[SZ]] %{{.+}}) // CHECK: call void [[L1:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}}) -// CHECK: call void @__kmpc_end_serialized_parallel( // CHECK: ret void // CHECK: define internal void [[L1]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: call void [[L2:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: ret void + +// CHECK: define internal void [[L2]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable // CHECK: store i16 1, i16* % // CHECK: ret void diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp index 3a0e5138277ec..abd9f4ae715f2 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -9,8 +9,9 @@ #define HEADER // CHECK: [[MEM_TY:%.+]] = type { [128 x i8] } -// CHECK-DAG: {{@__omp_offloading_.+}}_l19_exec_mode = weak constant i8 1 -// CHECK-DAG: internal unnamed_addr constant i{{64|32}} 4 +// CHECK-DAG: {{@__omp_offloading_.+}}_l20_exec_mode = weak constant i8 1 +// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 +// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 template tx ftemplate(int n) { @@ -34,10 +35,10 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l19}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() // CHECK: ret void - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l19}}() + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l20}}() // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() @@ -47,7 +48,7 @@ int bar(int n){ // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l19}}_worker() + // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l20}}_worker() // CHECK: br label {{%?}}[[EXIT:.+]] // // CHECK: [[CHECK_MASTER]] @@ -62,29 +63,35 @@ int bar(int n){ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) + // CHECK: call void [[PARALLEL:@.+]](i32* %{{.+}}, i32* %{{.+}}) + // CHECK: br label {{%?}}[[TERMINATE:.+]] + // + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit( + // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + // CHECK: br label {{%?}}[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + + // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}) + // CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], + // CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], + // CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], - // CHECK: [[RD:%.+]] = bitcast i8* [[PTR]] to [[GLOB_TY:%.+]]* + // CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 + // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]* // CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 // // CHECK: call void @__kmpc_for_static_init_4( // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*), i16 1) // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_VARS_PTR:%.+]], i{{64|32}} 1) // CHECK: [[SHARED_VARS_BUF:%.+]] = load i8**, i8*** [[SHARED_VARS_PTR]], + // CHECK: [[VARS_BUF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED_VARS_BUF]], i{{64|32}} 0 // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[I_ADDR]] to i8* - // CHECK: store i8* [[I_ADDR_BC]], i8** [[SHARED_VARS_BUF]], + // CHECK: store i8* [[I_ADDR_BC]], i8** [[VARS_BUF]], // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_end_sharing_variables() // CHECK: call void @__kmpc_for_static_fini( - // CHECK: br label {{%?}}[[TERMINATE:.+]] - // - // CHECK: [[TERMINATE]] - // CHECK: call void @__kmpc_kernel_deinit( - // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) - // CHECK: br label {{%?}}[[EXIT]] - // - // CHECK: [[EXIT]] - // CHECK: ret void - #endif diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index fe0c8dfb63fd9..206b999caa919 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -85,15 +85,22 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void [[PARALLEL:@.+]]( +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + +// CHECK: define internal void [[PARALLEL]]( +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]* +// CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( @@ -104,11 +111,12 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK: ret void + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL2:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void // CHECK: define internal void [[OUTL2]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, @@ -118,11 +126,12 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK: ret void + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL3:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void // CHECK: define internal void [[OUTL3]]( // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 33, @@ -131,6 +140,11 @@ int bar(int n){ // Distribute with collapse(2) // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]]) +// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK: ret void + // CHECK: alloca // CHECK: alloca // CHECK: alloca @@ -142,8 +156,6 @@ int bar(int n){ // CHECK: [[OMP_UB:%.+]] = alloca // CHECK: [[OMP_ST:%.+]] = alloca // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, -// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() -// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]], @@ -213,8 +225,7 @@ int bar(int n){ // CHECK: [[DIST_INNER_LOOP_END]]: // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// CHECK: ret void + // CHECK-32: define internal void [[OUTL4]]( // CHECK-64: define internal void [[OUTL4]]( diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp index f7ce26296139b..b000b24d997ee 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -24,6 +24,8 @@ int main(int argc, char **argv) { // CHECK: define weak void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4( // CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}}) @@ -32,8 +34,6 @@ int main(int argc, char **argv) { // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) - // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} [[ARGC:%.+]], i32* dereferenceable{{.*}}) // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( // CHECK: alloca i{{[0-9]+}}, diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp index 21fb46fc51d31..2d43ffed70163 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -71,15 +71,20 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) -// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], +// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]* +// CHECK: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 +// CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) +// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( @@ -90,10 +95,11 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL2:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK: define internal void [[OUTL2]]( @@ -104,10 +110,11 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL3:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK: define internal void [[OUTL3]]( @@ -119,11 +126,12 @@ int bar(int n){ // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]], // CHECK: {{call|invoke}} void [[OUTL4:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK: define internal void [[OUTL4]]( diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp index 6051637d553b4..091e1f8d0a2b1 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp @@ -64,32 +64,36 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l36( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l41( // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void // CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l46({{.+}}, i{{32|64}} [[F_IN:%.+]]) // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, // CHECK: call void @__kmpc_spmd_kernel_init(i32 %{{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) + // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]], // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void #endif diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp index 2d1ab9f063624..52b65fca4d8d5 100644 --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -37,9 +37,6 @@ int main (int argc, char **argv) { // only nvptx side: do not outline teams region and do not call fork_teams // CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) -// CK1: {{.+}} = alloca i{{[0-9]+}}*, -// CK1: {{.+}} = alloca i{{[0-9]+}}*, -// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*, // CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, // CK1: store {{.+}} 0, {{.+}}, // CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], @@ -53,16 +50,16 @@ int main (int argc, char **argv) { // CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] // CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], -// CK1-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) // CK1: ret void // CK1-NEXT: } +// CK1: define internal void [[OUTLINED]]( +// CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* % +// CK1-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( + // target region in template // CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]]) -// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***, // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] // CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], @@ -73,13 +70,14 @@ int main (int argc, char **argv) { // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] // CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]], -// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]], -// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], -// CK1-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) // CK1: ret void // CK1-NEXT: } +// CK1: define internal void [[OUTLINED]]( +// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** % +// CK1-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( + #endif // CK1 @@ -123,9 +121,6 @@ int main (int argc, char **argv) { // CK2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) -// CK2: {{.}} = alloca i{{[0-9]+}}*, -// CK2: {{.}} = alloca i{{[0-9]+}}*, -// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*, // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, @@ -145,15 +140,15 @@ int main (int argc, char **argv) { // CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], -// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( -// CK2-NOT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) // CK2: ret +// CK2: define internal void [[OUTLINED]]( +// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* % +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( + // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[BP:%.+]], i{{[0-9]+}}** [[ARGC:%.+]]) -// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***, // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}**, @@ -169,12 +164,13 @@ int main (int argc, char **argv) { // CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]], -// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]], -// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], -// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( -// CK2-NOT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) // CK2: ret void +// CK2: define internal void [[OUTLINED]]( +// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** % +// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( +// CK2-NOT: call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( + #endif // CK2 #endif diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index a8e22d82b2c2a..9048336f3fa07 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -77,6 +77,7 @@ int bar(int n){ // CHECK: {{call|invoke}} void [[T1]]_worker() // // CHECK: call void @__kmpc_kernel_init( + // CHECK: call void @__kmpc_kernel_deinit( // // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align // CHECK: [[EV:%.+]] = load double, double* [[E]], align @@ -86,7 +87,8 @@ int bar(int n){ // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8* // CHECK: store i8* [[BC]], i8** [[GEP1]], // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) + // CHECK: [[BUF:%.+]] = load i8*, i8** @ + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -99,7 +101,6 @@ int bar(int n){ // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] - // CHECK: call void @__kmpc_kernel_deinit( // // Reduction function @@ -344,6 +345,7 @@ int bar(int n){ // // CHECK: call void @__kmpc_kernel_init( + // CHECK: call void @__kmpc_kernel_deinit( // // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align @@ -360,7 +362,8 @@ int bar(int n){ // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8* // CHECK: store i8* [[BC]], i8** [[GEP2]], // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) + // CHECK: [[BUF:%.+]] = load i8*, i8** @ + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -380,7 +383,6 @@ int bar(int n){ // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] - // CHECK: call void @__kmpc_kernel_deinit( // // Reduction function @@ -706,6 +708,8 @@ int bar(int n){ // // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() + // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) + // CHECK-NOT: call void @__kmpc_get_team_static_memory // CHECK: store i32 0, // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align @@ -718,7 +722,8 @@ int bar(int n){ // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8* // CHECK: store i8* [[BC]], i8** [[GEP2]], // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) + // CHECK: [[BUF:%.+]] = load i8*, i8** @ + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -749,7 +754,6 @@ int bar(int n){ // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] - // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}}) // diff --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp index 0eff06d886783..3c6227b173aef 100644 --- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp @@ -167,9 +167,15 @@ sum = 0.0; // CHECK: [[ORIG_PTR_ADDR:%.+]] = call i8* @__kmpc_threadprivate_cached( // CHECK: [[ORIG_PTR_REF:%.+]] = bitcast i8* [[ORIG_PTR_ADDR]] to i8** // CHECK: load i8*, i8** [[ORIG_PTR_REF]], -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64( +// CHECK: call void [[OMP_INIT1:@.+]]( // CHECK: ret void +// CHECK: define internal void [[OMP_COMB1:@.+]](%struct.S* noalias, %struct.S* noalias) +// CHECK: fadd float % + +// CHECK: define internal void [[OMP_INIT1]](%struct.S* noalias, %struct.S* noalias) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64( + // CHECK: define internal void @[[RED_FINI2]](i8*) // CHECK: call i8* @__kmpc_threadprivate_cached( // CHECK: call void @ @@ -177,8 +183,7 @@ sum = 0.0; // CHECK: define internal void @[[RED_COMB2]](i8*, i8*) // CHECK: call i8* @__kmpc_threadprivate_cached( -// CHECK: fadd float % -// CHECK: store float %{{.+}}, float* % +// CHECK: call void [[OMP_COMB1]]( // CHECK: ret void // CHECK: define internal void @[[RED_INIT3]](i8*) diff --git a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp index 37a60c8b6755b..75d1faf58f28f 100644 --- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp @@ -165,9 +165,15 @@ sum = 0.0; // CHECK: define internal void @[[RED_INIT2]](i8*) // CHECK: call i8* @__kmpc_threadprivate_cached( // CHECK: call i8* @__kmpc_threadprivate_cached( -// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64( +// CHECK: call void [[OMP_INIT1:@.+]]( // CHECK: ret void +// CHECK: define internal void [[OMP_COMB1:@.+]](%struct.S* noalias, %struct.S* noalias) +// CHECK: fadd float % + +// CHECK: define internal void [[OMP_INIT1]](%struct.S* noalias, %struct.S* noalias) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64( + // CHECK: define internal void @[[RED_FINI2]](i8*) // CHECK: call i8* @__kmpc_threadprivate_cached( // CHECK: call void @ @@ -175,8 +181,7 @@ sum = 0.0; // CHECK: define internal void @[[RED_COMB2]](i8*, i8*) // CHECK: call i8* @__kmpc_threadprivate_cached( -// CHECK: fadd float % -// CHECK: store float %{{.+}}, float* % +// CHECK: call void [[OMP_COMB1]]( // CHECK: ret void // CHECK: define internal void @[[RED_INIT3]](i8*)