diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index 887a645a4783a..8ad77ee515d49 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -442,7 +442,7 @@ Address CodeGen::EmitVAArgInstr(CodeGenFunction &CGF, Address VAListAddr, assert(!AI.getCoerceToType() && "Unexpected CoerceToType seen in arginfo in generic VAArg emitter!"); - Address Temp = CGF.CreateMemTempWithoutCast(Ty, "varet"); + Address Temp = CGF.CreateMemTemp(Ty, "varet"); Val = CGF.Builder.CreateVAArg(VAListAddr.emitRawPointer(CGF), CGF.ConvertTypeForMem(Ty)); CGF.Builder.CreateStore(Val, Temp); diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 270965b109943..b4fd0fdb795aa 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -304,7 +304,7 @@ Address AtomicInfo::CreateTempAlloca() const { ? ValueTy : AtomicTy.getUnqualifiedType(); Address TempAlloca = - CGF.CreateMemTempWithoutCast(TmpTy, getAtomicAlignment(), "atomic-temp"); + CGF.CreateMemTemp(TmpTy, getAtomicAlignment(), "atomic-temp"); // Cast to pointer to value type for bitfields. if (LVal.isBitField()) return CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( @@ -826,7 +826,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, // into a temporary alloca. 
static Address EmitValToTemp(CodeGenFunction &CGF, Expr *E) { - Address DeclPtr = CGF.CreateMemTempWithoutCast(E->getType(), ".atomictmp"); + Address DeclPtr = CGF.CreateMemTemp(E->getType(), ".atomictmp"); CGF.EmitAnyExprToMem(E, DeclPtr, E->getType().getQualifiers(), /*Init*/ true); return DeclPtr; @@ -1025,7 +1025,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { CharUnits PointeeIncAmt = getContext().getTypeSizeInChars(MemTy->getPointeeType()); Val1Scalar = Builder.CreateMul(Val1Scalar, CGM.getSize(PointeeIncAmt)); - auto Temp = CreateMemTempWithoutCast(Val1Ty, ".atomictmp"); + auto Temp = CreateMemTemp(Val1Ty, ".atomictmp"); Val1 = Temp; EmitStoreOfScalar(Val1Scalar, MakeAddrLValue(Temp, Val1Ty)); break; @@ -1121,7 +1121,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { if (ShouldCastToIntPtrTy) Dest = Atomics.castToAtomicIntPointer(Dest); } else if (E->isCmpXChg()) - Dest = CreateMemTempWithoutCast(RValTy, "cmpxchg.bool"); + Dest = CreateMemTemp(RValTy, "cmpxchg.bool"); else if (!RValTy->isVoidType()) { Dest = Atomics.CreateTempAlloca(); if (ShouldCastToIntPtrTy) diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index 0683a4937cf37..1ce22df11e6a7 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -1418,8 +1418,7 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D, // Allocate a stack slot like for any local variable to guarantee optimal // debug info at -O0. The mem2reg pass will eliminate it when optimizing. 
- RawAddress alloc = - CreateMemTempWithoutCast(D->getType(), D->getName() + ".addr"); + RawAddress alloc = CreateMemTemp(D->getType(), D->getName() + ".addr"); Builder.CreateStore(arg, alloc); if (CGDebugInfo *DI = getDebugInfo()) { if (CGM.getCodeGenOpts().hasReducedDebugInfo()) { @@ -1558,8 +1557,8 @@ llvm::Function *CodeGenFunction::GenerateBlockFunction( if (!capture.isConstant()) continue; CharUnits align = getContext().getDeclAlign(variable); - Address alloca = CreateMemTempWithoutCast(variable->getType(), align, - "block.captured-const"); + Address alloca = + CreateMemTemp(variable->getType(), align, "block.captured-const"); Builder.CreateStore(capture.getConstant(), alloca); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 50d34889d8dc1..9ceda41c69da5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2276,9 +2276,10 @@ RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) { if (!isa(ArgVal)) { CleanupKind Cleanup = getARCCleanupKind(); QualType Ty = TheExpr->getType(); - RawAddress Alloca = CreateMemTempWithoutCast(Ty, "os.log.arg"); + RawAddress Alloca = RawAddress::invalid(); + RawAddress Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca); ArgVal = EmitARCRetain(Ty, ArgVal); - Builder.CreateStore(ArgVal, Alloca); + Builder.CreateStore(ArgVal, Addr); pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty, CodeGenFunction::destroyARCStrongPrecise, Cleanup & EHCleanup); @@ -6692,8 +6693,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, getContext().getSizeType(), ArraySize, nullptr, ArraySizeModifier::Normal, /*IndexTypeQuals=*/0); - auto Tmp = CreateMemTempWithoutCast(SizeArrayTy, "block_sizes"); - llvm::Value *Alloca = Tmp.getPointer(); + auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes"); + llvm::Value *TmpPtr = Tmp.getPointer(); + // The EmitLifetime* pair expect a naked Alloca as their last argument, + // however for cases where the default 
AS is not the Alloca AS, Tmp is + // actually the Alloca ascasted to the default AS, hence the + // stripPointerCasts() + llvm::Value *Alloca = TmpPtr->stripPointerCasts(); llvm::Value *ElemPtr; EmitLifetimeStart(Alloca); // Each of the following arguments specifies the size of the corresponding @@ -6710,6 +6716,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Builder.CreateAlignedStore( V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy)); } + // Return the Alloca itself rather than a potential ascast as this is only + // used by the paired EmitLifetimeEnd. return {ElemPtr, Alloca}; }; @@ -7140,7 +7148,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, // always just emit into it. TypeEvaluationKind EvalKind = getEvaluationKind(E->getType()); if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) { - Address DestPtr = CreateMemTempWithoutCast(E->getType(), "agg.tmp"); + Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp"); ReturnValue = ReturnValueSlot(DestPtr, false); } diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 292db6665b8e8..a727a006969d8 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -447,10 +447,10 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, // Create temporary dim3 grid_dim, block_dim. 
ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); QualType Dim3Ty = GridDimParam->getType(); - Address GridDim = CGF.CreateMemTempWithoutCast( - Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); - Address BlockDim = CGF.CreateMemTempWithoutCast( - Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); + Address GridDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); + Address BlockDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); Address ShmemSize = CGF.CreateTempAlloca(SizeTy, LangAS::Default, CGM.getSizeAlign(), "shmem_size"); Address Stream = CGF.CreateTempAlloca(PtrTy, LangAS::Default, diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 40cc275d40273..2468394929360 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3373,7 +3373,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, // may be aliased, copy it to ensure that the parameter variable is // mutable and has a unique adress, as C requires. if (ArgI.getIndirectRealign() || ArgI.isIndirectAliased()) { - RawAddress AlignedTemp = CreateMemTempWithoutCast(Ty, "coerce"); + RawAddress AlignedTemp = CreateMemTemp(Ty, "coerce"); // Copy from the incoming argument pointer to the temporary with the // appropriate alignment. 
@@ -3503,8 +3503,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, ParameterABI::SwiftErrorResult) { QualType pointeeTy = Ty->getPointeeType(); assert(pointeeTy->isPointerType()); - RawAddress temp = CreateMemTempWithoutCast( - pointeeTy, getPointerAlign(), "swifterror.temp"); + RawAddress temp = + CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp"); Address arg = makeNaturalAddressForPointer( V, pointeeTy, getContext().getTypeAlignInChars(pointeeTy)); llvm::Value *incomingErrorValue = Builder.CreateLoad(arg); @@ -3556,8 +3556,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, llvm::StructType *STy = dyn_cast(ArgI.getCoerceToType()); - Address Alloca = CreateMemTempWithoutCast( - Ty, getContext().getDeclAlign(Arg), Arg->getName()); + Address Alloca = + CreateMemTemp(Ty, getContext().getDeclAlign(Arg), Arg->getName()); // Pointer to store into. Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI); @@ -3646,8 +3646,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, case ABIArgInfo::CoerceAndExpand: { // Reconstruct into a temporary. - Address alloca = - CreateMemTempWithoutCast(Ty, getContext().getDeclAlign(Arg)); + Address alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg)); ArgVals.push_back(ParamValue::forIndirect(alloca)); auto coercionType = ArgI.getCoerceAndExpandType(); @@ -3688,8 +3687,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, // If this structure was expanded into multiple arguments then // we need to create a temporary and reconstruct it from the // arguments. 
- Address Alloca = - CreateMemTempWithoutCast(Ty, getContext().getDeclAlign(Arg)); + Address Alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg)); LValue LV = MakeAddrLValue(Alloca, Ty); ArgVals.push_back(ParamValue::forIndirect(Alloca)); @@ -3706,8 +3704,8 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, case ABIArgInfo::TargetSpecific: { auto *AI = Fn->getArg(FirstIRArg); AI->setName(Arg->getName() + ".target_coerce"); - Address Alloca = CreateMemTempWithoutCast( - Ty, getContext().getDeclAlign(Arg), Arg->getName()); + Address Alloca = + CreateMemTemp(Ty, getContext().getDeclAlign(Arg), Arg->getName()); Address Ptr = emitAddressAtOffset(*this, Alloca, ArgI); CGM.getABIInfo().createCoercedStore(AI, Ptr, ArgI, false, *this); if (CodeGenFunction::hasScalarEvaluationKind(Ty)) { @@ -3726,8 +3724,7 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, assert(NumIRArgs == 0); // Initialize the local variable appropriately. if (!hasScalarEvaluationKind(Ty)) { - ArgVals.push_back( - ParamValue::forIndirect(CreateMemTempWithoutCast(Ty))); + ArgVals.push_back(ParamValue::forIndirect(CreateMemTemp(Ty))); } else { llvm::Value *U = llvm::UndefValue::get(ConvertType(Arg->getType())); ArgVals.push_back(ParamValue::forDirect(U)); @@ -5034,7 +5031,7 @@ struct DestroyUnpassedArg final : EHScopeStack::Cleanup { RValue CallArg::getRValue(CodeGenFunction &CGF) const { if (!HasLV) return RV; - LValue Copy = CGF.MakeAddrLValue(CGF.CreateMemTempWithoutCast(Ty), Ty); + LValue Copy = CGF.MakeAddrLValue(CGF.CreateMemTemp(Ty), Ty); CGF.EmitAggregateCopy(Copy, LV, Ty, AggValueSlot::DoesNotOverlap, LV.isVolatile()); IsUsed = true; @@ -5613,8 +5610,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // For indirect things such as overaligned structs, replace the // placeholder with a regular aggregate temporary alloca. Store the // address of this alloca into the struct. 
- Addr = - CreateMemTempWithoutCast(info_it->type, "inalloca.indirect.tmp"); + Addr = CreateMemTemp(info_it->type, "inalloca.indirect.tmp"); Address ArgSlot = Builder.CreateStructGEP( ArgMemory, ArgInfo.getInAllocaFieldIndex()); Builder.CreateStore(Addr.getPointer(), ArgSlot); @@ -5759,8 +5755,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, swiftErrorArg = makeNaturalAddressForPointer( V, pointeeTy, getContext().getTypeAlignInChars(pointeeTy)); - swiftErrorTemp = CreateMemTempWithoutCast( - pointeeTy, getPointerAlign(), "swifterror.temp"); + swiftErrorTemp = + CreateMemTemp(pointeeTy, getPointerAlign(), "swifterror.temp"); V = swiftErrorTemp.getPointer(); cast(V)->setSwiftError(true); @@ -5795,7 +5791,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // FIXME: Avoid the conversion through memory if possible. Address Src = Address::invalid(); if (!I->isAggregate()) { - Src = CreateMemTempWithoutCast(I->Ty, "coerce"); + Src = CreateMemTemp(I->Ty, "coerce"); I->copyInto(*this, Src); } else { Src = I->hasLValue() ? I->getKnownLValue().getAddress() @@ -5952,7 +5948,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, case ABIArgInfo::TargetSpecific: { Address Src = Address::invalid(); if (!I->isAggregate()) { - Src = CreateMemTempWithoutCast(I->Ty, "target_coerce"); + Src = CreateMemTemp(I->Ty, "target_coerce"); I->copyInto(*this, Src); } else { Src = I->hasLValue() ? 
I->getKnownLValue().getAddress() @@ -6488,7 +6484,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, getContext().getTypeInfoDataSizeInChars(RetTy).Width.getQuantity(); if (!DestPtr.isValid()) { - DestPtr = CreateMemTempWithoutCast(RetTy, "coerce"); + DestPtr = CreateMemTemp(RetTy, "coerce"); DestIsVolatile = false; DestSize = getContext().getTypeSizeInChars(RetTy).getQuantity(); } @@ -6513,7 +6509,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Address StorePtr = emitAddressAtOffset(*this, DestPtr, RetAI); bool DestIsVolatile = ReturnValue.isVolatile(); if (!DestPtr.isValid()) { - DestPtr = CreateMemTempWithoutCast(RetTy, "target_coerce"); + DestPtr = CreateMemTemp(RetTy, "target_coerce"); DestIsVolatile = false; } CGM.getABIInfo().createCoercedStore(CI, StorePtr, RetAI, DestIsVolatile, diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index e52c5f6af2851..de11e8bca43f1 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -3190,7 +3190,7 @@ void CodeGenFunction::EmitLambdaStaticInvokeBody(const CXXMethodDecl *MD) { CanQualType LambdaType = getContext().getCanonicalTagType(Lambda); CanQualType ThisType = getContext().getPointerType(LambdaType); - Address ThisPtr = CreateMemTempWithoutCast(LambdaType, "unused.capture"); + Address ThisPtr = CreateMemTemp(LambdaType, "unused.capture"); CallArgs.add(RValue::get(ThisPtr.emitRawPointer(*this)), ThisType); EmitLambdaDelegatingInvokeBody(MD, CallArgs); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 7608f8cb6fc7a..63ad0bc9ec238 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2724,9 +2724,8 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, UseIndirectDebugAddress = !ArgInfo.getIndirectByVal(); if (UseIndirectDebugAddress) { auto PtrTy = getContext().getPointerType(Ty); - AllocaPtr = CreateMemTempWithoutCast( - PtrTy, 
getContext().getTypeAlignInChars(PtrTy), - D.getName() + ".indirect_addr"); + AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy), + D.getName() + ".indirect_addr"); EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy); } @@ -2763,7 +2762,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, DeclPtr = OpenMPLocalAddr; AllocaPtr = DeclPtr; } else { - // Otherwise, create a casted temporary to hold the value. + // Otherwise, create a temporary to hold the value. DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D), D.getName() + ".addr", &AllocaPtr); } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index 99dfaa80be429..0576582d34543 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -2130,7 +2130,7 @@ void CodeGenFunction::EmitSEHExceptionCodeSave(CodeGenFunction &ParentCGF, // On Win64, the info is passed as the first parameter to the filter. SEHInfo = &*CurFn->arg_begin(); SEHCodeSlotStack.push_back( - CreateMemTempWithoutCast(getContext().IntTy, "__exception_code")); + CreateMemTemp(getContext().IntTy, "__exception_code")); } else { // On Win32, the EBP on entry to the filter points to the end of an // exception registration object. It contains 6 32-bit fields, and the info @@ -2204,7 +2204,7 @@ void CodeGenFunction::EnterSEHTryStmt(const SEHTryStmt &S) { assert(Except); EHCatchScope *CatchScope = EHStack.pushCatch(1); SEHCodeSlotStack.push_back( - CreateMemTempWithoutCast(getContext().IntTy, "__exception_code")); + CreateMemTemp(getContext().IntTy, "__exception_code")); // If the filter is known to evaluate to 1, then we can use the clause // "catch i8* null". 
We can't do this on x86 because the filter has to save diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 0160b353f8f32..325902f2127bc 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -500,10 +500,7 @@ static RawAddress createReferenceTemporary(CodeGenFunction &CGF, // FIXME: Should we put the new global into a COMDAT? return RawAddress(C, GV->getValueType(), alignment); } - RawAddress Addr = CGF.CreateMemTempWithoutCast(Ty, "ref.tmp"); - if (Alloca) - *Alloca = Addr; - return Addr; + return CGF.CreateMemTemp(Ty, "ref.tmp", Alloca); } case SD_Thread: case SD_Static: @@ -1630,7 +1627,7 @@ RValue CodeGenFunction::GetUndefRValue(QualType Ty) { // identifiable address. Just because the contents of the value are undefined // doesn't mean that the address can't be taken and compared. case TEK_Aggregate: { - Address DestPtr = CreateMemTempWithoutCast(Ty, "undef.agg.tmp"); + Address DestPtr = CreateMemTemp(Ty, "undef.agg.tmp"); return RValue::getAggregate(DestPtr); } @@ -5980,7 +5977,7 @@ LValue CodeGenFunction::EmitCompoundLiteralLValue(const CompoundLiteralExpr *E){ // make sure to emit the VLA size. 
EmitVariablyModifiedType(E->getType()); - Address DeclPtr = CreateMemTempWithoutCast(E->getType(), ".compoundliteral"); + Address DeclPtr = CreateMemTemp(E->getType(), ".compoundliteral"); const Expr *InitExpr = E->getInitializer(); LValue Result = MakeAddrLValue(DeclPtr, E->getType(), AlignmentSource::Decl); diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index befc2659b2f4c..d20c9d208975c 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -2203,7 +2203,7 @@ void CodeGenFunction::EmitAggExpr(const Expr *E, AggValueSlot Slot) { LValue CodeGenFunction::EmitAggExprToLValue(const Expr *E) { assert(hasAggregateEvaluationKind(E->getType()) && "Invalid argument!"); - Address Temp = CreateMemTempWithoutCast(E->getType()); + Address Temp = CreateMemTemp(E->getType()); LValue LV = MakeAddrLValue(Temp, E->getType()); EmitAggExpr(E, AggValueSlot::forLValue(LV, AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers, diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 33d76cbda494a..f0f29603accb3 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -1408,7 +1408,7 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( // Create a temporary variable for the result, which is either going // to be a single resource instance or a local array of resources (we need to // return an LValue). 
- RawAddress TmpVar = CGF.CreateMemTempWithoutCast(ResultTy); + RawAddress TmpVar = CGF.CreateMemTemp(ResultTy); if (CGF.EmitLifetimeStart(TmpVar.getPointer())) CGF.pushFullExprCleanup( NormalEHLifetimeMarker, TmpVar); @@ -1541,14 +1541,19 @@ RawAddress CGHLSLRuntime::createBufferMatrixTempAddress(const LValue &LV, "expected cbuffer matrix"); QualType MatQualTy = LV.getType(); + llvm::Type *MemTy = CGF.ConvertTypeForMem(MatQualTy); llvm::Type *LayoutTy = HLSLBufferLayoutBuilder(CGF.CGM).layOutType(MatQualTy); - Address SrcAddr = LV.getAddress(); - if (LayoutTy == CGF.ConvertTypeForMem(MatQualTy)) - return SrcAddr; + if (LayoutTy == MemTy) + return LV.getAddress(); - RawAddress DestAlloca = - CGF.CreateMemTempWithoutCast(MatQualTy, "matrix.buf.copy"); + Address SrcAddr = LV.getAddress(); + // NOTE: CreateMemTemp flattens MatrixTypes, which causes overlapping + // GEPs in emitBufferCopy, so use CreateTempAlloca with the non-padded + // layout instead. + CharUnits Align = + CharUnits::fromQuantity(CGF.CGM.getDataLayout().getABITypeAlign(MemTy)); + RawAddress DestAlloca = CGF.CreateTempAlloca(MemTy, Align, "matrix.buf.copy"); emitBufferCopy(CGF, DestAlloca, SrcAddr, MatQualTy); return DestAlloca; } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 9548efbeb72cf..ec059f9dfef82 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -919,7 +919,7 @@ static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, BaseTy = BaseTy.getNonReferenceType(); while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && !CGF.getContext().hasSameType(BaseTy, ElTy)) { - Tmp = CGF.CreateMemTempWithoutCast(BaseTy); + Tmp = CGF.CreateMemTemp(BaseTy); if (TopTmp.isValid()) CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp); else @@ -2059,8 +2059,7 @@ Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF, llvm::Value *ThreadID = getThreadID(CGF, Loc); QualType Int32Ty = 
CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true); - Address ThreadIDTemp = - CGF.CreateMemTempWithoutCast(Int32Ty, /*Name*/ ".threadid_temp."); + Address ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp."); CGF.EmitStoreOfScalar(ThreadID, CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty)); @@ -2339,7 +2338,7 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF, // int32 did_it = 0; QualType KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1); - DidIt = CGF.CreateMemTempWithoutCast(KmpInt32Ty, ".omp.copyprivate.did_it"); + DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it"); CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt); } // Prepare arguments and build a call to __kmpc_single @@ -2366,8 +2365,8 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF, C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal, /*IndexTypeQuals=*/0); // Create a list of all private variables for copyprivate. - Address CopyprivateList = CGF.CreateMemTempWithoutCast( - CopyprivateArrayTy, ".omp.copyprivate.cpr_list"); + Address CopyprivateList = + CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list"); for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) { Address Elem = CGF.Builder.CreateConstArrayGEP(CopyprivateList, I); CGF.Builder.CreateStore( @@ -3994,8 +3993,8 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, KmpTaskAffinityInfoTy, llvm::APInt(C.getTypeSize(C.getSizeType()), NumAffinities), nullptr, ArraySizeModifier::Normal, /*IndexTypeQuals=*/0); - AffinitiesArray = CGF.CreateMemTempWithoutCast(KmpTaskAffinityInfoArrayTy, - ".affs.arr.addr"); + AffinitiesArray = + CGF.CreateMemTemp(KmpTaskAffinityInfoArrayTy, ".affs.arr.addr"); AffinitiesArray = CGF.Builder.CreateConstArrayGEP(AffinitiesArray, 0); NumOfElements = llvm::ConstantInt::get(CGM.Int32Ty, NumAffinities, /*isSigned=*/false); @@ -4032,7 +4031,7 @@ 
CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, LValue PosLVal; if (HasIterator) { PosLVal = CGF.MakeAddrLValue( - CGF.CreateMemTempWithoutCast(C.getSizeType(), "affs.counter.addr"), + CGF.CreateMemTemp(C.getSizeType(), "affs.counter.addr"), C.getSizeType()); CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Pos), PosLVal); } @@ -4310,7 +4309,7 @@ SmallVector CGOpenMPRuntime::emitDepobjElementsSizes( std::tie(NumDeps, Base) = getDepobjElements(CGF, DepobjLVal, E->getExprLoc()); LValue NumLVal = CGF.MakeAddrLValue( - CGF.CreateMemTempWithoutCast(C.getUIntPtrType(), "depobj.size.addr"), + CGF.CreateMemTemp(C.getUIntPtrType(), "depobj.size.addr"), C.getUIntPtrType()); CGF.Builder.CreateStore(llvm::ConstantInt::get(CGF.IntPtrTy, 0), NumLVal.getAddress()); @@ -4455,7 +4454,7 @@ std::pair CGOpenMPRuntime::emitDependClause( KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies), nullptr, ArraySizeModifier::Normal, /*IndexTypeQuals=*/0); DependenciesArray = - CGF.CreateMemTempWithoutCast(KmpDependInfoArrayTy, ".dep.arr.addr"); + CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr"); DependenciesArray = CGF.Builder.CreateConstArrayGEP(DependenciesArray, 0); NumOfElements = llvm::ConstantInt::get(CGM.Int32Ty, NumDependencies, /*isSigned=*/false); @@ -4468,8 +4467,7 @@ std::pair CGOpenMPRuntime::emitDependClause( } // Copy regular dependencies with iterators. 
LValue PosLVal = CGF.MakeAddrLValue( - CGF.CreateMemTempWithoutCast(C.getSizeType(), "dep.counter.addr"), - C.getSizeType()); + CGF.CreateMemTemp(C.getSizeType(), "dep.counter.addr"), C.getSizeType()); CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Pos), PosLVal); for (const OMPTaskDataTy::DependData &Dep : Dependencies) { if (Dep.DepKind == OMPC_DEPEND_depobj || !Dep.IteratorExpr) @@ -4560,7 +4558,7 @@ Address CGOpenMPRuntime::emitDepobjDependClause( LValue PosLVal; if (Dependencies.IteratorExpr) { PosLVal = CGF.MakeAddrLValue( - CGF.CreateMemTempWithoutCast(C.getSizeType(), "iterator.counter.addr"), + CGF.CreateMemTemp(C.getSizeType(), "iterator.counter.addr"), C.getSizeType()); CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.SizeTy, Idx), PosLVal, /*IsInit=*/true); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index cb0e7297f1a89..4e6c2aac0d17a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1374,7 +1374,7 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion( // Initialize the counter variable for the loop. 
QualType Int32Ty = CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); - Address Counter = CGF.CreateMemTempWithoutCast(Int32Ty, "critical_counter"); + Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, /*isInit=*/true); @@ -1436,7 +1436,7 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, if (CastTy->isIntegerType() && ValTy->isIntegerType()) return CGF.Builder.CreateIntCast(Val, LLVMCastTy, CastTy->hasSignedIntegerRepresentation()); - Address CastItem = CGF.CreateMemTempWithoutCast(CastTy); + Address CastItem = CGF.CreateMemTemp(CastTy); Address ValCastItem = CastItem.withElementType(Val->getType()); CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, LValueBaseInfo(AlignmentSource::Type), diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index b70667d04d1f6..71f88cdf58954 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -609,7 +609,7 @@ CodeGenFunction::EmitCompoundStmtWithoutScope(const CompoundStmt &S, // We can't return an RValue here because there might be cleanups at // the end of the StmtExpr. Because of that, we have to emit the result // here into a temporary alloca. - RetAlloca = CreateMemTempWithoutCast(ExprTy); + RetAlloca = CreateMemTemp(ExprTy); EmitAnyExprToMem(E, RetAlloca, Qualifiers(), /*IsInit*/ false); } diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 1eaf8efa142c5..82307d3a064c6 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -426,7 +426,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars( // and load it as a void pointer. 
if (!CurField->getType()->isAnyPointerType()) { ASTContext &Ctx = getContext(); - Address DstAddr = CreateMemTempWithoutCast( + Address DstAddr = CreateMemTemp( Ctx.getUIntPtrType(), Twine(CurCap->getCapturedVar()->getName(), ".casted")); LValue DstLV = MakeAddrLValue(DstAddr, Ctx.getUIntPtrType()); @@ -5352,7 +5352,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( ParamTypes.push_back(PrivatesPtr->getType()); for (const Expr *E : Data.PrivateVars) { const auto *VD = cast(cast(E)->getDecl()); - RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast( + RawAddress PrivatePtr = CGF.CreateMemTemp( CGF.getContext().getPointerType(E->getType()), ".priv.ptr.addr"); PrivatePtrs.emplace_back(VD, PrivatePtr); CallArgs.push_back(PrivatePtr.getPointer()); @@ -5360,9 +5360,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( } for (const Expr *E : Data.FirstprivateVars) { const auto *VD = cast(cast(E)->getDecl()); - RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast( - CGF.getContext().getPointerType(E->getType()), - ".firstpriv.ptr.addr"); + RawAddress PrivatePtr = + CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()), + ".firstpriv.ptr.addr"); PrivatePtrs.emplace_back(VD, PrivatePtr); FirstprivatePtrs.emplace_back(VD, PrivatePtr); CallArgs.push_back(PrivatePtr.getPointer()); @@ -5370,9 +5370,9 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( } for (const Expr *E : Data.LastprivateVars) { const auto *VD = cast(cast(E)->getDecl()); - RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast( - CGF.getContext().getPointerType(E->getType()), - ".lastpriv.ptr.addr"); + RawAddress PrivatePtr = + CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()), + ".lastpriv.ptr.addr"); PrivatePtrs.emplace_back(VD, PrivatePtr); CallArgs.push_back(PrivatePtr.getPointer()); ParamTypes.push_back(PrivatePtr.getType()); @@ -5383,7 +5383,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( Ty = CGF.getContext().getPointerType(Ty); if (isAllocatableDecl(VD)) Ty = 
CGF.getContext().getPointerType(Ty); - RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast( + RawAddress PrivatePtr = CGF.CreateMemTemp( CGF.getContext().getPointerType(Ty), ".local.ptr.addr"); auto Result = UntiedLocalVars.insert( std::make_pair(VD, std::make_pair(PrivatePtr, Address::invalid()))); @@ -5674,9 +5674,9 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective( ParamTypes.push_back(PrivatesPtr->getType()); for (const Expr *E : Data.FirstprivateVars) { const auto *VD = cast(cast(E)->getDecl()); - RawAddress PrivatePtr = CGF.CreateMemTempWithoutCast( - CGF.getContext().getPointerType(E->getType()), - ".firstpriv.ptr.addr"); + RawAddress PrivatePtr = + CGF.CreateMemTemp(CGF.getContext().getPointerType(E->getType()), + ".firstpriv.ptr.addr"); PrivatePtrs.emplace_back(VD, PrivatePtr); CallArgs.push_back(PrivatePtr.getPointer()); ParamTypes.push_back(PrivatePtr.getType()); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index cb9afc3d9928f..777017f90d428 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2960,13 +2960,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// aggregate type. 
AggValueSlot CreateAggTemp(QualType T, const Twine &Name = "tmp", RawAddress *Alloca = nullptr) { - RawAddress Addr = CreateMemTempWithoutCast(T, Name); - if (Alloca) - *Alloca = Addr; return AggValueSlot::forAddr( - Addr, T.getQualifiers(), AggValueSlot::IsNotDestructed, - AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased, - AggValueSlot::DoesNotOverlap); + CreateMemTemp(T.getUnqualifiedType(), Name, Alloca), T.getQualifiers(), + AggValueSlot::IsNotDestructed, AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap); } /// EvaluateExprAsBool - Perform the usual unary conversions on the specified diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 6d8c710c9fe4b..9b444206e8a3d 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -4513,7 +4513,7 @@ void MicrosoftCXXABI::emitThrow(CodeGenFunction &CGF, const CXXThrowExpr *E) { QualType ThrowType = SubExpr->getType(); // The exception object lives on the stack and it's address is passed to the // runtime function. 
- Address AI = CGF.CreateMemTempWithoutCast(ThrowType); + Address AI = CGF.CreateMemTemp(ThrowType); CGF.EmitAnyExprToMem(SubExpr, AI, ThrowType.getQualifiers(), /*IsInit=*/true); diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index b7c7bc8ebf9a0..104c36679515e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -2382,7 +2382,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, : Intrinsic::arm_strexd); llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty); - Address Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType()); + Address Tmp = CreateMemTemp(E->getArg(0)->getType()); Value *Val = EmitScalarExpr(E->getArg(0)); Builder.CreateStore(Val, Tmp); @@ -4768,7 +4768,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, : Intrinsic::aarch64_stxp); llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty); - Address Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType()); + Address Tmp = CreateMemTemp(E->getArg(0)->getType()); EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true); Tmp = Tmp.withElementType(STy); diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index ee1ccd83e3aa2..9645ed87b8ef3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1024,14 +1024,14 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } case X86::BI_mm_setcsr: case X86::BI__builtin_ia32_ldmxcsr: { - RawAddress Tmp = CreateMemTempWithoutCast(E->getArg(0)->getType()); + RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType()); Builder.CreateStore(Ops[0], Tmp); return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr), Tmp.getPointer()); } case X86::BI_mm_getcsr: case X86::BI__builtin_ia32_stmxcsr: { - RawAddress Tmp = CreateMemTempWithoutCast(E->getType()); + RawAddress Tmp = CreateMemTemp(E->getType()); 
Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr), Tmp.getPointer()); return Builder.CreateLoad(Tmp, "stmxcsr"); diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 61ab591f55be9..4a57ca7767bd2 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -3168,7 +3168,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // FIXME: Cleanup. assert(AI.isDirect() && "Unexpected ABI info for mixed regs"); llvm::StructType *ST = cast(AI.getCoerceToType()); - Address Tmp = CGF.CreateMemTempWithoutCast(Ty); + Address Tmp = CGF.CreateMemTemp(Ty); Tmp = Tmp.withElementType(ST); assert(ST->getNumElements() == 2 && "Unexpected ABI info for mixed regs"); llvm::Type *TyLo = ST->getElementType(0); @@ -3228,7 +3228,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // The stored size of this structure is smaller than its actual size, // which may lead to reading past the end of the register save area. if (CoTy && (AI.getDirectOffset() == 8 || RegSize < TySize)) { - Address Tmp = CGF.CreateMemTempWithoutCast(Ty); + Address Tmp = CGF.CreateMemTemp(Ty); llvm::Value *Addr = CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, GpOrFpOffset); llvm::Value *Src = CGF.Builder.CreateAlignedLoad(CoTy, Addr, TyAlign); @@ -3247,7 +3247,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, // Copy into a temporary if the type is more aligned than the // register save area. if (neededInt && TyAlign.getQuantity() > 8) { - Address Tmp = CGF.CreateMemTempWithoutCast(Ty); + Address Tmp = CGF.CreateMemTemp(Ty); CGF.Builder.CreateMemCpy(Tmp, RegAddr, TySize, false); RegAddr = Tmp; } @@ -3271,7 +3271,7 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, ? 
AI.getCoerceToType() : llvm::StructType::get(CGF.DoubleTy, CGF.DoubleTy); llvm::Value *V; - Address Tmp = CGF.CreateMemTempWithoutCast(Ty); + Address Tmp = CGF.CreateMemTemp(Ty); Tmp = Tmp.withElementType(ST); V = CGF.Builder.CreateLoad( RegAddrLo.withElementType(ST->getStructElementType(0))); diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c index 686190f9ef947..16b2b459e2cb2 100644 --- a/clang/test/CodeGen/scoped-atomic-ops.c +++ b/clang/test/CodeGen/scoped-atomic-ops.c @@ -140,41 +140,47 @@ int fi1a(int *i) { // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] 
= load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[ATOMIC_TEMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr addrspace(5) [[ATOMIC_TEMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], 
align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4 -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[ATOMIC_TEMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 @@ -191,41 +197,47 @@ int fi1a(int *i) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5) +// 
AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[ATOMIC_TEMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // 
AMDGCN_CL_20-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr addrspace(5) [[ATOMIC_TEMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // 
AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4 -// AMDGCN_CL_20-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[ATOMIC_TEMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 @@ -481,30 +493,36 @@ void fi2a(int *i) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] 
= load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") 
monotonic, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 // AMDGCN_CL_DEF-NEXT: ret void // @@ -518,30 +536,36 @@ void fi2a(int *i) { // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast 
ptr addrspace(5) [[DOTATOMICTMP5]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: 
[[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 // AMDGCN_CL_20-NEXT: ret void // @@ -663,6 +687,22 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -672,67 +712,67 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 
4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// 
AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load 
ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// 
AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], 
!amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -764,6 +804,22 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -773,67 +829,67 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] 
= load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store 
i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// 
AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr 
[[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -1086,6 +1142,22 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] 
to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -1095,67 +1167,67 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: 
[[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr 
addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr 
[[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr 
[[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -1187,6 +1259,22 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: 
[[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -1196,67 +1284,67 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // 
AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load 
i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], 
align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw 
nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr 
addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -1509,6 +1597,22 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: 
[[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -1518,67 +1622,67 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw 
add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// 
AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: 
[[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// 
AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -1610,6 +1714,22 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: 
[[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -1619,67 +1739,67 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// 
AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr 
addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // 
AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// 
AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) 
[[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -1932,6 +2052,22 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -1941,67 +2077,67 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// 
AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // 
AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// 
AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, 
!amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -2033,6 +2169,22 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // 
AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // 
AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -2042,67 +2194,67 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], 
!amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: 
[[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr 
addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: 
[[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -2355,6 +2507,22 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to 
ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -2364,67 +2532,67 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// 
AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: 
[[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr 
addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") 
monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// 
AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -2456,6 +2624,22 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr 
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -2465,67 +2649,67 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, 
align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], 
align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// 
AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 
// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// 
AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -2778,6 +2962,22 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 @@ -2787,67 +2987,67 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], 
align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_DEF-NEXT: 
[[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 
[[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) 
[[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 
[[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -2879,6 +3079,22 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 @@ -2888,67 +3104,67 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: store ptr [[G]], ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[H]], ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 
4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 // AMDGCN_CL_20-NEXT: 
[[TMP15:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr addrspace(5) [[D_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP7]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP8]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// 
AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[E_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP9]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP10]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr addrspace(5) [[F_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP11]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// 
AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP12]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr addrspace(5) [[G_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP13]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP14]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr addrspace(5) [[H_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void @@ -3176,6 +3392,7 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3191,8 +3408,8 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3203,6 +3420,7 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3219,8 +3437,8 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: 
store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3296,6 +3514,7 @@ _Bool fi4a(int *i) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3311,8 +3530,8 @@ _Bool fi4a(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3323,6 +3542,7 @@ _Bool fi4a(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// 
AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3339,8 +3559,8 @@ _Bool fi4a(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3416,6 +3636,7 @@ _Bool fi4b(int *i) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3431,8 +3652,8 @@ _Bool fi4b(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// 
AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3443,6 +3664,7 @@ _Bool fi4b(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3459,8 +3681,8 @@ _Bool fi4b(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3536,6 +3758,7 @@ _Bool fi4c(int *i) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr 
[[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3551,8 +3774,8 @@ _Bool fi4c(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3563,6 +3786,7 @@ _Bool fi4c(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3579,8 +3803,8 @@ _Bool fi4c(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne 
i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3656,6 +3880,7 @@ _Bool fi4_clustr(int *i) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3671,8 +3896,8 @@ _Bool fi4_clustr(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3683,6 +3908,7 @@ _Bool fi4_clustr(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3699,8 +3925,8 @@ _Bool 
fi4_clustr(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3776,6 +4002,7 @@ _Bool fi4d(int *i) { // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr // AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 @@ -3791,8 +4018,8 @@ _Bool fi4d(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3803,6 +4030,7 @@ _Bool fi4d(int *i) { // AMDGCN_CL_20-NEXT: 
[[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 @@ -3819,8 +4047,8 @@ _Bool fi4d(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP7]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -3895,12 +4123,14 @@ _Bool fi4e(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -3910,8 +4140,8 @@ _Bool fi4e(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -3922,12 +4152,14 @@ _Bool fi4e(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store 
i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -3937,8 +4169,8 @@ _Bool fi4e(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -4012,12 +4244,14 @@ _Bool fi5a(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load 
ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4027,8 +4261,8 @@ _Bool fi5a(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -4039,12 +4273,14 @@ _Bool fi5a(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr 
addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4054,8 +4290,8 @@ _Bool fi5a(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -4129,12 +4365,14 @@ _Bool fi5b(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // 
AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4144,8 +4382,8 @@ _Bool fi5b(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -4156,12 +4394,14 @@ _Bool fi5b(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4171,8 +4411,8 @@ _Bool fi5b(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -4245,12 +4485,14 @@ _Bool fi5c(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4260,8 +4502,8 @@ _Bool fi5c(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -4272,12 +4514,14 @@ _Bool fi5c(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, 
addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4287,8 +4531,8 @@ _Bool fi5c(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -4361,12 +4605,14 @@ _Bool fi5_clustr(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4376,8 +4622,8 @@ _Bool fi5_clustr(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -4388,12 +4634,14 @@ _Bool fi5_clustr(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] 
= alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4403,8 +4651,8 @@ _Bool fi5_clustr(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ 
-4477,12 +4725,14 @@ _Bool fi5d(int *i) { // AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr // AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4492,8 +4742,8 @@ _Bool fi5d(int *i) { // AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // 
AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -4504,12 +4754,14 @@ _Bool fi5d(int *i) { // AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[I]], ptr addrspace(5) [[I_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[I_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 @@ -4519,8 +4771,8 @@ _Bool fi5d(int *i) { // AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] // AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: // AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 -// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(5) [[CMPXCHG_BOOL]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// 
AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP6]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5035,13 +5287,15 @@ int fi6e(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5051,13 +5305,15 @@ int fi6e(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, 
addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5105,13 +5361,15 @@ _Bool fi7a(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // 
AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5121,13 +5379,15 @@ _Bool fi7a(_Bool *c) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr 
[[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5175,13 +5435,15 @@ _Bool fi7b(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr 
addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5191,13 +5453,15 @@ _Bool fi7b(_Bool *c) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5245,13 +5509,15 @@ _Bool fi7c(_Bool *c) { // AMDGCN_CL_DEF-NEXT: 
[[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5261,13 +5527,15 @@ _Bool fi7c(_Bool *c) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: 
[[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5315,13 +5583,15 @@ _Bool fi7_clustr(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load 
i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5331,13 +5601,15 @@ _Bool fi7_clustr(_Bool *c) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], 
!amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5385,13 +5657,15 @@ _Bool fi7d(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr 
[[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] // @@ -5401,13 +5675,15 @@ _Bool fi7d(_Bool *c) { // AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[C]], ptr addrspace(5) [[C_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i8 1, ptr addrspace(5) [[DOTATOMICTMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(5) [[DOTATOMICTMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[ATOMIC_TEMP]], align 1 +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = icmp ne i8 [[TMP3]], 0 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // @@ -5459,22 +5735,26 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // 
AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr // AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) 
[[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] -// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_DEF-NEXT: ret void @@ -5488,22 +5768,26 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) // AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr // AMDGCN_CL_20-NEXT: store ptr [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store ptr [[B]], ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr 
addrspace(5) [[DOTATOMICTMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 -// AMDGCN_CL_20-NEXT: store i32 -1, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr addrspace(5) [[A_ADDR]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // 
AMDGCN_CL_20-NEXT: ret void diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu index 092d509b292fc..bf45a353851b4 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu @@ -20,9 +20,10 @@ // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 @@ -32,13 +33,14 @@ // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META4:![0-9]+]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8 -// 
CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0 @@ -56,7 +58,7 @@ // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META4:![0-9]+]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -88,26 +90,28 @@ __global__ void kernel1(int *x) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull 
[[META3:![0-9]+]], !align [[META4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8 -// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 -// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META5:![0-9]+]] +// CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8, !align [[META6:![0-9]+]] // CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 // CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[TMP0]], align 4 
@@ -122,7 +126,7 @@ __global__ void kernel1(int *x) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -167,7 +171,7 @@ __global__ void kernel2(int &x) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( -// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8 // CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8 @@ -191,7 +195,7 @@ __global__ void kernel2(int &x) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i( -// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly captures(none) [[X:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly captures(none) [[X:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1:[0-9]+]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = 
load i32, ptr addrspace(2) [[X]], align 4 // OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4 @@ -257,7 +261,7 @@ __global__ void kernel3(__attribute__((address_space(2))) int *x, // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_func void @_Z4funcPi( -// OPT-SPIRV-SAME: ptr addrspace(4) noundef captures(none) [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR1]] { +// OPT-SPIRV-SAME: ptr addrspace(4) noundef captures(none) [[X:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[X]], align 4 // OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 @@ -280,16 +284,16 @@ struct S { // CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S( // CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8, addrspace(5) -// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[S]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) -// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr -// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S_ASCAST]], i32 0, i32 0 +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_S]], align 8, addrspace(5) +// CHECK-NEXT: [[S:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[S]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[X]], align 8 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX]], align 4 -// CHECK-NEXT: 
[[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S_ASCAST]], i32 0, i32 1 +// CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[S]], i32 0, i32 1 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[Y]], align 8 // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0 // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 @@ -298,18 +302,18 @@ struct S { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( -// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_S:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] -// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8 -// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p0.p2.i64(ptr align 8 [[S]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false) -// CHECK-SPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S_ASCAST]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_S]], align 8 +// CHECK-SPIRV-NEXT: [[S:%.*]] = addrspacecast ptr [[COERCE]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p4.p2.i64(ptr addrspace(4) align 8 [[S]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S]], i32 0, i32 0 // CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 0 // CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX]], align 4 // CHECK-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 
[[TMP2]], 1 // CHECK-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[ARRAYIDX]], align 4 -// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S_ASCAST]], i32 0, i32 1 +// CHECK-SPIRV-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr addrspace(4) [[S]], i32 0, i32 1 // CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[Y]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP3]], i64 0 // CHECK-SPIRV-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX1]], align 4 @@ -320,12 +324,12 @@ struct S { // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel41S( // OPT-SAME: ptr addrspace(4) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // OPT-NEXT: [[ENTRY:.*:]] -// OPT-NEXT: [[S_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META3:![0-9]+]] -// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[S_SROA_0_0_COPYLOAD]] to ptr addrspace(1) -// OPT-NEXT: [[S_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8 -// OPT-NEXT: [[S_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[S_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META3]] -// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[S_SROA_2_0_COPYLOAD]] to ptr addrspace(1) -// OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META3]] +// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META4:![0-9]+]] +// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8 +// OPT-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, 
!amdgpu.noclobber [[META4]] +// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]] // OPT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP3]], 1 // OPT-NEXT: store i32 [[INC]], ptr addrspace(1) [[TMP1]], align 4 // OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4 @@ -334,17 +338,17 @@ struct S { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S( -// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_S:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] -// OPT-SPIRV-NEXT: [[S_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8 -// OPT-SPIRV-NEXT: [[S_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8 -// OPT-SPIRV-NEXT: [[S_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[S_SROA_2_0__SROA_IDX]], align 8 -// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[S_SROA_0_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8 +// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8 +// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[COERCE_SROA_2_0__SROA_IDX]], align 8 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 -// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[S_SROA_0_0_COPYLOAD]], 
align 4 -// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[S_SROA_2_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: store i32 [[INC]], ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00 -// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[S_SROA_2_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: ret void // // HOST-LABEL: define dso_local void @_Z22__device_stub__kernel41S( @@ -376,9 +380,10 @@ __global__ void kernel4(struct S s) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr // CHECK-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr addrspace(5) [[S]], align 8 -// CHECK-NEXT: [[S1:%.*]] = load ptr, ptr addrspace(5) [[S]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr [[S_ASCAST]], align 8 +// CHECK-NEXT: [[S1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 // CHECK-NEXT: store ptr [[S1]], ptr [[S_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP0]], i32 0, i32 0 @@ -397,13 +402,14 @@ __global__ void kernel4(struct S s) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( -// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: 
[[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) // CHECK-SPIRV-NEXT: [[S_ADDR_ASCAST:%.*]] = addrspacecast ptr [[S_ADDR]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr [[S]], align 8 -// CHECK-SPIRV-NEXT: [[S1:%.*]] = load ptr addrspace(4), ptr [[S]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[S_COERCE]], ptr addrspace(4) [[S_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[S1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ASCAST]], align 8 // CHECK-SPIRV-NEXT: store ptr addrspace(4) [[S1]], ptr addrspace(4) [[S_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr addrspace(4) [[TMP0]], i32 0, i32 0 @@ -436,7 +442,7 @@ __global__ void kernel4(struct S s) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S( -// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[S_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -481,17 +487,17 @@ struct T { // CHECK-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T( // CHECK-SAME: ptr addrspace(4) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8, addrspace(5) -// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 8 [[T]], ptr 
addrspace(4) align 8 [[TMP0]], i64 16, i1 false) -// CHECK-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr -// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T_ASCAST]], i32 0, i32 0 +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_T]], align 8, addrspace(5) +// CHECK-NEXT: [[T:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[T]], ptr addrspace(4) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X]], i64 0, i64 0 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 0 // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 // CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00 // CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX1]], align 4 -// CHECK-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T_ASCAST]], i32 0, i32 0 +// CHECK-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr [[T]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[X2]], i64 0, i64 1 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX3]], align 8 // CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 0 @@ -501,19 +507,19 @@ struct T { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( -// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(2) noundef byref([[STRUCT_T:%.*]]) align 8 [[TMP0:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] -// CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 
8 -// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p0.p2.i64(ptr align 8 [[T]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false) -// CHECK-SPIRV-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T_ASCAST]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_T]], align 8 +// CHECK-SPIRV-NEXT: [[T:%.*]] = addrspacecast ptr [[COERCE]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: call addrspace(4) void @llvm.memcpy.p4.p2.i64(ptr addrspace(4) align 8 [[T]], ptr addrspace(2) align 8 [[TMP0]], i64 16, i1 false) +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T]], i32 0, i32 0 // CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X]], i64 0, i64 0 // CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP1]], i64 0 // CHECK-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[ARRAYIDX1]], align 4 // CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP2]], 1.000000e+00 // CHECK-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[ARRAYIDX1]], align 4 -// CHECK-SPIRV-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T_ASCAST]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[X2:%.*]] = getelementptr inbounds nuw [[STRUCT_T]], ptr addrspace(4) [[T]], i32 0, i32 0 // CHECK-SPIRV-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x ptr addrspace(4)], ptr addrspace(4) [[X2]], i64 0, i64 1 // CHECK-SPIRV-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ARRAYIDX3]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr addrspace(4) [[TMP3]], i64 0 @@ -525,12 +531,12 @@ struct T { // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel61T( // 
OPT-SAME: ptr addrspace(4) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { // OPT-NEXT: [[ENTRY:.*:]] -// OPT-NEXT: [[T_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META3]] -// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[T_SROA_0_0_COPYLOAD]] to ptr addrspace(1) -// OPT-NEXT: [[T_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8 -// OPT-NEXT: [[T_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[T_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META3]] -// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[T_SROA_2_0_COPYLOAD]] to ptr addrspace(1) -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META3]] +// OPT-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[TMP0]], align 8, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[COERCE_SROA_0_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP0]], i64 8 +// OPT-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr, ptr addrspace(4) [[COERCE_SROA_2_0__SROA_IDX]], align 8, !amdgpu.noclobber [[META4]] +// OPT-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[COERCE_SROA_2_0_COPYLOAD]] to ptr addrspace(1) +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(1) [[TMP1]], align 4, !amdgpu.noclobber [[META4]] // OPT-NEXT: [[ADD:%.*]] = fadd contract float [[TMP3]], 1.000000e+00 // OPT-NEXT: store float [[ADD]], ptr addrspace(1) [[TMP1]], align 4 // OPT-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(1) [[TMP2]], align 4 @@ -539,17 +545,17 @@ struct T { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T( -// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size 
[[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(2) noundef readonly byref([[STRUCT_T:%.*]]) align 8 captures(none) [[TMP0:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] -// OPT-SPIRV-NEXT: [[T_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8 -// OPT-SPIRV-NEXT: [[T_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8 -// OPT-SPIRV-NEXT: [[T_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[T_SROA_2_0__SROA_IDX]], align 8 -// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[T_SROA_0_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: [[COERCE_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[TMP0]], align 8 +// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0__SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(2) [[TMP0]], i64 8 +// OPT-SPIRV-NEXT: [[COERCE_SROA_2_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(2) [[COERCE_SROA_2_0__SROA_IDX]], align 8 +// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 1.000000e+00 -// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[T_SROA_0_0_COPYLOAD]], align 4 -// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[T_SROA_2_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: store float [[ADD]], ptr addrspace(4) [[COERCE_SROA_0_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: [[ADD5:%.*]] = fadd contract float [[TMP2]], 2.000000e+00 -// OPT-SPIRV-NEXT: store float [[ADD5]], ptr addrspace(4) [[T_SROA_2_0_COPYLOAD]], align 4 +// OPT-SPIRV-NEXT: store float [[ADD5]], ptr addrspace(4) [[COERCE_SROA_2_0_COPYLOAD]], align 4 // OPT-SPIRV-NEXT: ret void // // HOST-LABEL: define dso_local void @_Z22__device_stub__kernel61T( @@ -581,9 +587,10 @@ 
__global__ void kernel6(struct T t) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(5) [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 0 @@ -593,13 +600,14 @@ __global__ void kernel6(struct T t) { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( -// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-SPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-SPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr [[X]], align 8 -// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[X_COERCE]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-SPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // 
CHECK-SPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-SPIRV-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 0 @@ -617,7 +625,7 @@ __global__ void kernel6(struct T t) { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi( -// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: [[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[X_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -652,10 +660,10 @@ struct SS { // CHECK-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8, addrspace(5) -// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(5) [[A]], i32 0, i32 0 -// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr -// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A_ASCAST]], i32 0, i32 0 +// CHECK-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0 +// CHECK-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[COERCE_DIVE]], align 8 +// CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A1]], i32 0, i32 0 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4 // 
CHECK-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00 @@ -663,13 +671,13 @@ struct SS { // CHECK-NEXT: ret void // // CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( -// CHECK-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// CHECK-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // CHECK-SPIRV-NEXT: [[ENTRY:.*:]] // CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 -// CHECK-SPIRV-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[A]], i32 0, i32 0 -// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr [[COERCE_DIVE]], align 8 -// CHECK-SPIRV-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) -// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A_ASCAST]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) +// CHECK-SPIRV-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0 +// CHECK-SPIRV-NEXT: store ptr addrspace(1) [[A_COERCE]], ptr addrspace(4) [[COERCE_DIVE]], align 8 +// CHECK-SPIRV-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr addrspace(4) [[A1]], i32 0, i32 0 // CHECK-SPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X]], align 8 // CHECK-SPIRV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[TMP0]], align 4 // CHECK-SPIRV-NEXT: [[ADD:%.*]] = fadd contract float [[TMP1]], 3.000000e+00 @@ -685,7 +693,7 @@ struct SS { // OPT-NEXT: ret void // // OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS( -// OPT-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META4]] { +// OPT-SPIRV-SAME: ptr addrspace(1) [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] { // OPT-SPIRV-NEXT: 
[[ENTRY:.*:]] // OPT-SPIRV-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[A_COERCE]] to i64 // OPT-SPIRV-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) @@ -713,13 +721,13 @@ __global__ void kernel8(struct SS a) { *a.x += 3.f; } //. -// CHECK: [[META3]] = !{} -// CHECK: [[META4]] = !{i64 4} +// CHECK: [[META4]] = !{} +// CHECK: [[META5]] = !{i64 4} //. -// CHECK-SPIRV: [[META4]] = !{i32 1024, i32 1, i32 1} -// CHECK-SPIRV: [[META5]] = !{i64 4} +// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1} +// CHECK-SPIRV: [[META6]] = !{i64 4} //. -// OPT: [[META3]] = !{} +// OPT: [[META4]] = !{} //. -// OPT-SPIRV: [[META4]] = !{i32 1024, i32 1, i32 1} +// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1} //. diff --git a/clang/test/CodeGenCUDA/atomic-options.hip b/clang/test/CodeGenCUDA/atomic-options.hip index 7b319516a1010..28ef6c3e8521f 100644 --- a/clang/test/CodeGenCUDA/atomic-options.hip +++ b/clang/test/CodeGenCUDA/atomic-options.hip @@ -37,13 +37,15 @@ // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory 
[[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z12test_defaultPf( @@ -53,13 +55,15 @@ // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3:![0-9]+]], !amdgpu.ignore.denormal.mode [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z12test_defaultPf( @@ -69,13 +73,15 @@ // SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 
// SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z12test_defaultPf( @@ -85,13 +91,15 @@ // SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store 
ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4:![0-9]+]], !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // __device__ __host__ void test_default(float *a) { @@ -120,13 +128,15 @@ __device__ __host__ void test_default(float *a) { // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// 
DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z8test_onePf( @@ -136,13 +146,15 @@ __device__ __host__ void test_default(float *a) { // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: 
[[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z8test_onePf( @@ -152,13 +164,15 @@ __device__ __host__ void test_default(float *a) { // SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z8test_onePf( @@ -168,13 +182,15 @@ __device__ __host__ void test_default(float *a) { // SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // 
SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // __device__ __host__ void test_one(float *a) { @@ -205,13 +221,15 @@ __device__ __host__ void test_one(float *a) { // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // 
DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z8test_twoPf( @@ -221,13 +239,15 @@ __device__ __host__ void test_one(float *a) { // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, 
!amdgpu.ignore.denormal.mode [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z8test_twoPf( @@ -237,13 +257,15 @@ __device__ __host__ void test_one(float *a) { // SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // 
SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z8test_twoPf( @@ -253,13 +275,15 @@ __device__ __host__ void test_one(float *a) { // SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // __device__ __host__ void test_two(float *a) { @@ -290,13 +314,15 @@ __device__ __host__ void test_two(float *a) { // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: 
[[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z10test_threePf( @@ -306,13 +332,15 @@ __device__ __host__ void test_two(float *a) { // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr 
[[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z10test_threePf( @@ -322,13 +350,15 @@ __device__ __host__ void test_two(float *a) { // SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float 
[[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z10test_threePf( @@ -338,13 +368,15 @@ __device__ __host__ void test_two(float *a) { // SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // __device__ __host__ void test_three(float *a) { @@ -375,13 +407,15 @@ __device__ __host__ void test_three(float *a) { // DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: 
[[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z19test_multiple_attrsPf( @@ -391,13 +425,15 @@ __device__ __host__ void test_three(float *a) { // OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 1.000000e+00, ptr 
addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z19test_multiple_attrsPf( @@ -407,13 +443,15 @@ __device__ __host__ void test_three(float *a) { // SPIRV-DEV-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory 
[[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z19test_multiple_attrsPf( @@ -423,13 +461,15 @@ __device__ __host__ void test_three(float *a) { // SPIRV-OPT-NEXT: [[DOTATOMICTMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // 
__device__ __host__ void test_multiple_attrs(float *a) { @@ -490,31 +530,39 @@ __device__ __host__ void test_multiple_attrs(float *a) { // DEV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) // DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// DEV-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// DEV-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// DEV-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// DEV-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr // DEV-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // DEV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] -// DEV-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// DEV-NEXT: store float [[TMP2]], ptr 
[[ATOMIC_TEMP_ASCAST]], align 4 +// DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // DEV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 2.000000e+00, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// DEV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// DEV-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// DEV-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // DEV-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 -// DEV-NEXT: store float [[TMP6]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// DEV-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// DEV-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// DEV-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // DEV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 3.000000e+00, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// DEV-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// DEV-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// DEV-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // DEV-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META3]] -// DEV-NEXT: store float [[TMP10]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// DEV-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// DEV-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// DEV-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // DEV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// DEV-NEXT: store float 4.000000e+00, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// DEV-NEXT: [[TMP13:%.*]] = load float, ptr 
addrspace(5) [[DOTATOMICTMP5]], align 4 +// DEV-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// DEV-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // DEV-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META3]] -// DEV-NEXT: store float [[TMP14]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// DEV-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// DEV-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// DEV-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // DEV-NEXT: ret void // // OPT-LABEL: define dso_local void @_Z11test_nestedPf( @@ -530,31 +578,39 @@ __device__ __host__ void test_multiple_attrs(float *a) { // OPT-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4, addrspace(5) // OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// OPT-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// OPT-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// OPT-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// OPT-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr // OPT-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 // OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// 
OPT-NEXT: store float 1.000000e+00, ptr addrspace(5) [[DOTATOMICTMP]], align 4 -// OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP]], align 4 +// OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP_ASCAST]], align 4 // OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META3]], !amdgpu.ignore.denormal.mode [[META3]] -// OPT-NEXT: store float [[TMP2]], ptr addrspace(5) [[ATOMIC_TEMP]], align 4 -// OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP]], align 4 +// OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // OPT-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 2.000000e+00, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 -// OPT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP1]], align 4 +// OPT-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// OPT-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // OPT-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr [[TMP4]], float [[TMP5]] syncscope("agent") seq_cst, align 4 -// OPT-NEXT: store float [[TMP6]], ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 -// OPT-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP2]], align 4 +// OPT-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// OPT-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // OPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 3.000000e+00, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 -// OPT-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP3]], align 4 +// OPT-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// OPT-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3_ASCAST]], 
align 4 // OPT-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META3]] -// OPT-NEXT: store float [[TMP10]], ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 -// OPT-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP4]], align 4 +// OPT-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// OPT-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // OPT-NEXT: [[TMP12:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 -// OPT-NEXT: store float 4.000000e+00, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 -// OPT-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[DOTATOMICTMP5]], align 4 +// OPT-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// OPT-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // OPT-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr [[TMP12]], float [[TMP13]] syncscope("wavefront") release, align 4, !amdgpu.no.fine.grained.memory [[META3]] -// OPT-NEXT: store float [[TMP14]], ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 -// OPT-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[ATOMIC_TEMP6]], align 4 +// OPT-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// OPT-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // OPT-NEXT: ret void // // SPIRV-DEV-LABEL: define spir_func void @_Z11test_nestedPf( @@ -570,31 +626,39 @@ __device__ __host__ void test_multiple_attrs(float *a) { // SPIRV-DEV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4 // SPIRV-DEV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = 
addrspacecast ptr [[DOTATOMICTMP1]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP2]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP3]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP4]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP5]] to ptr addrspace(4) +// SPIRV-DEV-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP6]] to ptr addrspace(4) // SPIRV-DEV-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-DEV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-DEV-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4 -// SPIRV-DEV-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-DEV-NEXT: store float 2.000000e+00, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4 
+// SPIRV-DEV-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr addrspace(4) [[TMP4]], float [[TMP5]] syncscope("device") seq_cst, align 4 -// SPIRV-DEV-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4 -// SPIRV-DEV-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP6]], ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4 -// SPIRV-DEV-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-DEV-NEXT: store float 3.000000e+00, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr addrspace(4) [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4 -// SPIRV-DEV-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP10]], ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4 // SPIRV-DEV-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-DEV-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4 -// SPIRV-DEV-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-DEV-NEXT: store float 4.000000e+00, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4 // SPIRV-DEV-NEXT: 
[[TMP14:%.*]] = atomicrmw fsub ptr addrspace(4) [[TMP12]], float [[TMP13]] syncscope("subgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] -// SPIRV-DEV-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4 -// SPIRV-DEV-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-DEV-NEXT: store float [[TMP14]], ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4 +// SPIRV-DEV-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4 // SPIRV-DEV-NEXT: ret void // // SPIRV-OPT-LABEL: define spir_func void @_Z11test_nestedPf( @@ -610,31 +674,39 @@ __device__ __host__ void test_multiple_attrs(float *a) { // SPIRV-OPT-NEXT: [[DOTATOMICTMP5:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca float, align 4 // SPIRV-OPT-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP1]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP2]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP3]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP4]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr [[DOTATOMICTMP5]] to ptr addrspace(4) +// SPIRV-OPT-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr [[ATOMIC_TEMP6]] to ptr addrspace(4) // SPIRV-OPT-NEXT: store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // SPIRV-OPT-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr [[DOTATOMICTMP]], 
align 4 -// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-OPT-NEXT: store float 1.000000e+00, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(4) [[TMP0]], float [[TMP1]] monotonic, align 4, !amdgpu.no.remote.memory [[META4]], !amdgpu.ignore.denormal.mode [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 -// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP2]], ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 2.000000e+00, ptr [[DOTATOMICTMP1]], align 4 -// SPIRV-OPT-NEXT: [[TMP5:%.*]] = load float, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-OPT-NEXT: store float 2.000000e+00, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP1_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP6:%.*]] = atomicrmw fmax ptr addrspace(4) [[TMP4]], float [[TMP5]] syncscope("device") seq_cst, align 4 -// SPIRV-OPT-NEXT: store float [[TMP6]], ptr [[ATOMIC_TEMP2]], align 4 -// SPIRV-OPT-NEXT: [[TMP7:%.*]] = load float, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP6]], ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP2_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP8:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 3.000000e+00, ptr [[DOTATOMICTMP3]], align 4 -// SPIRV-OPT-NEXT: [[TMP9:%.*]] = load float, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-OPT-NEXT: store 
float 3.000000e+00, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP3_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP10:%.*]] = atomicrmw fmin ptr addrspace(4) [[TMP8]], float [[TMP9]] syncscope("workgroup") acquire, align 4, !amdgpu.no.remote.memory [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP10]], ptr [[ATOMIC_TEMP4]], align 4 -// SPIRV-OPT-NEXT: [[TMP11:%.*]] = load float, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP10]], ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP4_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP12:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 -// SPIRV-OPT-NEXT: store float 4.000000e+00, ptr [[DOTATOMICTMP5]], align 4 -// SPIRV-OPT-NEXT: [[TMP13:%.*]] = load float, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-OPT-NEXT: store float 4.000000e+00, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(4) [[DOTATOMICTMP5_ASCAST]], align 4 // SPIRV-OPT-NEXT: [[TMP14:%.*]] = atomicrmw fsub ptr addrspace(4) [[TMP12]], float [[TMP13]] syncscope("subgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] -// SPIRV-OPT-NEXT: store float [[TMP14]], ptr [[ATOMIC_TEMP6]], align 4 -// SPIRV-OPT-NEXT: [[TMP15:%.*]] = load float, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-OPT-NEXT: store float [[TMP14]], ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4 +// SPIRV-OPT-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(4) [[ATOMIC_TEMP6_ASCAST]], align 4 // SPIRV-OPT-NEXT: ret void // __device__ __host__ void test_nested(float *a) { diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu index 35673773ec80c..7edf64db91f2e 100644 --- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -14,10 +14,11 @@ // 
CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DISPATCH_PTR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -38,10 +39,11 @@ __global__ void use_dispatch_ptr(int* out) { // CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[QUEUE_PTR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call ptr 
addrspace(4) @llvm.amdgcn.queue.ptr() // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -62,10 +64,11 @@ __global__ void use_queue_ptr(int* out) { // CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMPLICITARG_PTR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -124,11 +127,12 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED]] to ptr // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(5) [[SHARED]], align 8 -// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr 
addrspace(5) [[SHARED]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 @@ -172,11 +176,12 @@ __global__ void endpgm() { // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr // CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8 @@ -198,9 +203,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: 
store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 @@ -221,11 +227,12 @@ __device__ void func(float *x); // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5) // CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED]] to ptr // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SHARED_ADDR]] to ptr // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(5) [[SHARED]], align 8 -// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr addrspace(5) [[SHARED]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr, ptr [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr [[SHARED1]], ptr [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8 @@ -247,10 +254,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) { // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1, addrspace(5) +// CHECK-NEXT: 
[[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(5) [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[TMP0]]) @@ -267,10 +275,11 @@ __global__ void test_is_shared(float *x){ // CHECK-NEXT: [[X:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1, addrspace(5) +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(5) [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr addrspace(5) [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr, ptr [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr [[X1]], ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[TMP0]]) diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu index 94338f9027db1..7f48a8608af1d 100644 --- a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu +++ 
b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu @@ -14,10 +14,11 @@ // CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 @@ -32,10 +33,11 @@ // AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// 
AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8 @@ -55,10 +57,11 @@ __global__ void use_dispatch_ptr(int* out) { // CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr() // CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 @@ -73,10 +76,11 @@ __global__ void use_dispatch_ptr(int* out) { // AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
[[OUT_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr() // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8 @@ -96,10 +100,11 @@ __global__ void use_queue_ptr(int* out) { // CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 @@ 
-114,10 +119,11 @@ __global__ void use_queue_ptr(int* out) { // AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8 @@ -198,11 +204,12 @@ __global__ void test_ds_fadd(float src) { // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 // CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8 -// CHECK-NEXT: [[SHARED1:%.*]] = load ptr 
addrspace(4), ptr [[SHARED]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 @@ -218,11 +225,12 @@ __global__ void test_ds_fadd(float src) { // AMDGCNSPIRV-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 // AMDGCNSPIRV-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8 -// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 @@ -265,11 +273,12 @@ __global__ void endpgm() { // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // 
CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 @@ -286,11 +295,12 @@ __global__ void endpgm() { // AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 // AMDGCNSPIRV-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr 
addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8 @@ -312,9 +322,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META5:![0-9]+]]) // CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] @@ -330,9 +341,10 @@ __global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, un // AMDGCNSPIRV-NEXT: entry: // AMDGCNSPIRV-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr [[OUT]], align 8 -// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr 
addrspace(4), ptr [[OUT]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[OUT_COERCE:%.*]], ptr addrspace(4) [[OUT_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = call addrspace(4) i1 @llvm.spv.named.boolean.spec.constant(i32 -1, i1 false, metadata [[META7:![0-9]+]]) // AMDGCNSPIRV-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] @@ -359,11 +371,12 @@ __device__ void func(float *x); // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 // CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[X:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) // CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8 -// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 @@ -381,11 +394,12 @@ __device__ void func(float *x); // AMDGCNSPIRV-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4 // AMDGCNSPIRV-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), 
align 8 // AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca float, align 4 +// AMDGCNSPIRV-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr [[SHARED]], align 8 -// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr [[SHARED]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[SHARED_COERCE:%.*]], ptr addrspace(4) [[SHARED_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8 @@ -407,10 +421,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) { // CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 
// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -424,10 +439,11 @@ __global__ void test_ds_fmin_func(float src, float *__restrict shared) { // AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[RET:%.*]] = alloca i8, align 1 +// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8 -// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -445,10 +461,11 @@ __global__ void test_is_shared(float *x){ // CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) // CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) -// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], 
ptr [[X]], align 8 -// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8 +// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr @@ -462,10 +479,11 @@ __global__ void test_is_shared(float *x){ // AMDGCNSPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8 // AMDGCNSPIRV-NEXT: [[RET:%.*]] = alloca i8, align 1 +// AMDGCNSPIRV-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4) -// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr [[X]], align 8 -// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr [[X]], align 8 +// AMDGCNSPIRV-NEXT: store ptr addrspace(1) [[X_COERCE:%.*]], ptr addrspace(4) [[X_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8 // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr diff --git a/clang/test/CodeGenCUDA/record-layout.cu b/clang/test/CodeGenCUDA/record-layout.cu index 12ce64f2be56d..847a81d88d280 100644 --- a/clang/test/CodeGenCUDA/record-layout.cu +++ b/clang/test/CodeGenCUDA/record-layout.cu @@ -65,9 +65,10 @@ struct J : I { }; // DEV: define dso_local 
amdgpu_kernel void @_Z8C_kernel1C(ptr addrspace(4) noundef byref(%struct.C) align 4 %0) -// DEV: %c = alloca %struct.C, align 4, addrspace(5) -// DEV: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 4 %c, ptr addrspace(4) align 4 %0, i64 8, i1 false) -// DEV: %i = getelementptr inbounds nuw %struct.C, ptr %c.ascast, i32 0, i32 1 +// DEV: %coerce = alloca %struct.C, align 4, addrspace(5) +// DEV: %c = addrspacecast ptr addrspace(5) %coerce to ptr +// DEV: call void @llvm.memcpy.p0.p4.i64(ptr align 4 %c, ptr addrspace(4) align 4 %0, i64 8, i1 false) +// DEV: %i = getelementptr inbounds nuw %struct.C, ptr %c, i32 0, i32 1 // DEV: store i32 1, ptr %i, align 4 __global__ void C_kernel(C c) diff --git a/clang/test/CodeGenCXX/amdgcn-func-arg.cpp b/clang/test/CodeGenCXX/amdgcn-func-arg.cpp index 3304be8eddade..3cc5dd7828464 100644 --- a/clang/test/CodeGenCXX/amdgcn-func-arg.cpp +++ b/clang/test/CodeGenCXX/amdgcn-func-arg.cpp @@ -1,30 +1,10 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -O0 -triple amdgcn -emit-llvm %s -o - | FileCheck %s class A { public: int x; -// CHECK-LABEL: define linkonce_odr void @_ZN1AC1Ev( -// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr -// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call void @_ZN1AC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: ret void -// A():x(0) {} -// CHECK-LABEL: define linkonce_odr void @_ZN1AD1Ev( -// CHECK-SAME: ptr noundef nonnull 
align 4 dead_on_return(4) dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr -// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8 -// CHECK-NEXT: call void @_ZN1AD2Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[THIS1]]) #[[ATTR6:[0-9]+]] -// CHECK-NEXT: ret void -// ~A() {} }; @@ -38,13 +18,13 @@ B g_b; void func_with_ref_arg(A &a); void func_with_ref_arg(B &b); -// CHECK-LABEL: define dso_local void @_Z22func_with_indirect_arg1A( -// CHECK-SAME: ptr addrspace(5) noundef [[A:%.*]]) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: @_Z22func_with_indirect_arg1A( +// CHECK-NEXT: entry: // CHECK-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_INDIRECT_ADDR]] to ptr // CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr -// CHECK-NEXT: store ptr addrspace(5) [[A]], ptr addrspace(5) [[A_INDIRECT_ADDR]], align 8 +// CHECK-NEXT: store ptr addrspace(5) [[A:%.*]], ptr [[A_INDIRECT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr // CHECK-NEXT: store ptr [[A_ASCAST]], ptr [[P_ASCAST]], align 8 // CHECK-NEXT: ret void @@ -53,19 +33,19 @@ void func_with_indirect_arg(A a) { A *p = &a; } -// CHECK-LABEL: define dso_local void @_Z22test_indirect_arg_autov( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: @_Z22test_indirect_arg_autov( +// CHECK-NEXT: entry: // CHECK-NEXT: [[A:%.*]] = alloca [[CLASS_A:%.*]], align 4, addrspace(5) // CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_A]], 
align 4, addrspace(5) // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr -// CHECK-NEXT: call void @_ZN1AC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 [[A_ASCAST]], i64 4, i1 false) -// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP]]) #[[ATTR5]] // CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr -// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) #[[ATTR6]] -// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) #[[ATTR5]] -// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[A_ASCAST]]) #[[ATTR6]] +// CHECK-NEXT: call void @_ZN1AC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[A_ASCAST]], i64 4, i1 false) +// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP_ASCAST_ASCAST]]) +// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) +// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) [[A_ASCAST]]) +// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[A_ASCAST]]) // CHECK-NEXT: ret void // void test_indirect_arg_auto() { @@ -74,15 +54,15 @@ void test_indirect_arg_auto() { func_with_ref_arg(a); } -// CHECK-LABEL: define dso_local void @_Z24test_indirect_arg_globalv( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: 
@_Z24test_indirect_arg_globalv( +// CHECK-NEXT: entry: // CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_A:%.*]], align 4, addrspace(5) -// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 addrspacecast (ptr addrspace(1) @g_a to ptr), i64 4, i1 false) -// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP]]) #[[ATTR5]] // CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr -// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) #[[ATTR6]] -// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @g_a to ptr)) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_a to ptr), i64 4, i1 false) +// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @_Z22func_with_indirect_arg1A(ptr addrspace(5) noundef [[AGG_TMP_ASCAST_ASCAST]]) +// CHECK-NEXT: call void @_ZN1AD1Ev(ptr noundef nonnull align 4 dead_on_return(4) dereferenceable(4) [[AGG_TMP_ASCAST]]) +// CHECK-NEXT: call void @_Z17func_with_ref_argR1A(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @g_a to ptr)) // CHECK-NEXT: ret void // void test_indirect_arg_global() { @@ -90,30 +70,30 @@ void test_indirect_arg_global() { func_with_ref_arg(g_a); } -// CHECK-LABEL: define dso_local void @_Z19func_with_byval_arg1B( -// CHECK-SAME: ptr addrspace(5) noundef byref([[CLASS_B:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[B:%.*]] = alloca [[CLASS_B]], align 4, addrspace(5) +// CHECK-LABEL: @_Z19func_with_byval_arg1B( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5) // CHECK-NEXT: [[P:%.*]] = 
alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr // CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr -// CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[B]], ptr addrspace(5) align 4 [[TMP0]], i64 400, i1 false) -// CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr -// CHECK-NEXT: store ptr [[B_ASCAST]], ptr [[P_ASCAST]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[B]], ptr addrspace(5) align 4 [[TMP0:%.*]], i64 400, i1 false) +// CHECK-NEXT: store ptr [[B]], ptr [[P_ASCAST]], align 8 // CHECK-NEXT: ret void // void func_with_byval_arg(B b) { B *p = &b; } -// CHECK-LABEL: define dso_local void @_Z19test_byval_arg_autov( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: @_Z19test_byval_arg_autov( +// CHECK-NEXT: entry: // CHECK-NEXT: [[B:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5) // CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_B]], align 4, addrspace(5) // CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr -// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 [[B_ASCAST]], i64 400, i1 false) -// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP]]) #[[ATTR5]] -// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]]) #[[ATTR5]] +// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[B_ASCAST]], i64 400, i1 false) +// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) +// 
CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]]) // CHECK-NEXT: ret void // void test_byval_arg_auto() { @@ -122,13 +102,14 @@ void test_byval_arg_auto() { func_with_ref_arg(b); } -// CHECK-LABEL: define dso_local void @_Z21test_byval_arg_globalv( -// CHECK-SAME: ) #[[ATTR1]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: @_Z21test_byval_arg_globalv( +// CHECK-NEXT: entry: // CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5) -// CHECK-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 4 [[AGG_TMP]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false) -// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP]]) #[[ATTR5]] -// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr)) #[[ATTR5]] +// CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false) +// CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) +// CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr)) // CHECK-NEXT: ret void // void test_byval_arg_global() { diff --git a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp index 38b58203be745..62ea3c991f26c 100644 --- a/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp +++ b/clang/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp @@ -109,9 +109,7 
@@ std::initializer_list thread_local x = {1, 2, 3, 4}; // AMDGCN: store ptr addrspacecast ({{[^@]+}} @_ZGR15globalInitList2_ {{[^)]+}}), // AMDGCN: ptr addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), align 8 // AMDGCN: store i64 2, ptr getelementptr inbounds nuw (i8, ptr addrspacecast ({{[^@]+}} @globalInitList2 {{[^)]+}}), i64 8), align 8 -// AMDGCN-NEXT: {{.*}} = addrspacecast ptr addrspace(5) {{.*}} to ptr // CHECK: call void @_ZN10destroyme1D1Ev -// AMDGCN-NEXT: {{.*}} = addrspacecast ptr addrspace(5) {{.*}} to ptr // CHECK-NEXT: call void @_ZN10destroyme1D1Ev // CHECK-NEXT: ret void std::initializer_list globalInitList2 = { @@ -123,6 +121,7 @@ void fn1(int i) { // temporary array // X86: [[array:%[^ ]+]] = alloca [3 x i32] // AMDGCN: [[alloca:%[^ ]+]] = alloca [3 x i32], align 4, addrspace(5) + // AMDGCN: [[array:%[^ ]+]] ={{.*}} addrspacecast ptr addrspace(5) [[alloca]] to ptr // CHECK: store i32 1, ptr // CHECK-NEXT: getelementptr // CHECK-NEXT: store @@ -483,7 +482,7 @@ namespace B19773010 { // CHECK-LABEL: @_ZN9B197730102f1Ev testcase a{{"", ENUM_CONSTANT}}; // X86: store ptr @.ref.tmp{{.*}}, ptr %{{.*}}, align 8 - // AMDGCN: store ptr addrspacecast{{.*}} @.ref.tmp{{.*}}{{.*}}, ptr addrspace(5) %{{.*}}, align 8 + // AMDGCN: store ptr addrspacecast{{.*}} @.ref.tmp{{.*}}{{.*}}, ptr %{{.*}}, align 8 } void f2() { // CHECK-LABEL: @_ZN9B197730102f2Ev diff --git a/clang/test/CodeGenHIP/placement-new-addrspace.hip b/clang/test/CodeGenHIP/placement-new-addrspace.hip index eceb9b5bb3dd9..48a401baf9a78 100644 --- a/clang/test/CodeGenHIP/placement-new-addrspace.hip +++ b/clang/test/CodeGenHIP/placement-new-addrspace.hip @@ -33,9 +33,10 @@ __attribute__((device)) Big make_big() { return Big(7); } // CHECK-NEXT: [[OUT:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BIG:%.*]], align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT]] to ptr // CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE]], ptr addrspace(5) [[OUT]], align 8 -// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr addrspace(5) [[OUT]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUT_COERCE]], ptr [[OUT_ASCAST]], align 8 +// CHECK-NEXT: [[OUT1:%.*]] = load ptr, ptr [[OUT_ASCAST]], align 8 // CHECK-NEXT: store ptr [[OUT1]], ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 // CHECK-NEXT: call void @_Z8make_bigv(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_BIG]]) align 4 [[TMP]]) #[[ATTR3]] diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index 5c33c5ca8a4f9..32ab1372ae591 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -731,10 +731,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 
// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 // AMDGCN20-NEXT: ret void @@ -745,9 +746,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 // AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 @@ -856,12 +858,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) // AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr 
addrspace(5) [[U]], i32 0, i32 0 // AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 // AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 // AMDGCN20-NEXT: ret void @@ -872,9 +875,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN20-NEXT: [[ENTRY:.*:]] // AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, 
i32 1 // AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 // AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index 9dabe64d8b75b..ffeb942b6e0a3 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -205,10 +205,11 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 // AMDGCN-NEXT: ret void @@ -219,9 +220,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, 
addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 @@ -330,12 +332,13 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 // AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: store <2 
x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 // AMDGCN-NEXT: ret void @@ -346,9 +349,10 @@ kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr // AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 // AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 87de7a1087411..6b94d5b868cec 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -129,6 +129,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[BLOCK_ASCAST:%.*]] 
= addrspacecast ptr addrspace(5) [[BLOCK]] to ptr // NOCPU-NEXT: [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr // NOCPU-NEXT: [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr +// NOCPU-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr // NOCPU-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // NOCPU-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 // NOCPU-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1 @@ -479,6 +480,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr // GFX900-NEXT: [[BLOCK3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr // GFX900-NEXT: [[BLOCK12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr +// GFX900-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr // GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // GFX900-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[CHARPTR_TBAA15]] // GFX900-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[CHAR_TBAA17]] diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl index 18a3401f3a339..28d1f572421f6 100644 --- a/clang/test/CodeGenOpenCL/atomic-ops.cl +++ b/clang/test/CodeGenOpenCL/atomic-ops.cl @@ -344,11 +344,12 @@ int test_volatile(volatile atomic_int *i) { // CHECK-LABEL: @test_volatile // CHECK: %[[i_addr:.*]] = alloca ptr // CHECK-NEXT: %[[atomicdst:.*]] = alloca i32 + // CHECK-NEXT: %[[atomicdst_ascast:.*]] = addrspacecast ptr addrspace(5) %[[atomicdst]] to ptr // CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]] // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]] // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr 
%[[addr]] syncscope("workgroup") seq_cst, align 4{{$}} - // CHECK-NEXT: store i32 %[[res]], ptr addrspace(5) %[[atomicdst]] - // CHECK-NEXT: %[[retval:.*]] = load i32, ptr addrspace(5) %[[atomicdst]] + // CHECK-NEXT: store i32 %[[res]], ptr %[[atomicdst_ascast]] + // CHECK-NEXT: %[[retval:.*]] = load i32, ptr %[[atomicdst_ascast]] // CHECK-NEXT: ret i32 %[[retval]] return __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group); } diff --git a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp index 270671284c3d1..528d27f85e54b 100644 --- a/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp +++ b/clang/test/CodeGenSYCL/kernel-caller-entry-point.cpp @@ -387,11 +387,11 @@ int main() { // CHECK-AMDGCN-NEXT: define dso_local amdgpu_kernel void @_ZTS26single_purpose_kernel_name // CHECK-AMDGCN-SAME: (ptr addrspace(4) noundef byref(%struct.single_purpose_kernel) align 1 %0) #[[AMDGCN_ATTR0:[0-9]+]] { // CHECK-AMDGCN-NEXT: entry: -// CHECK-AMDGCN-NEXT: %kernelFunc = alloca %struct.single_purpose_kernel, align 1, addrspace(5) -// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false) -// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr +// CHECK-AMDGCN-NEXT: %coerce = alloca %struct.single_purpose_kernel, align 1, addrspace(5) +// CHECK-AMDGCN-NEXT: %kernelFunc = addrspacecast ptr addrspace(5) %coerce to ptr +// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false) // CHECK-AMDGCN-NEXT: call void @_ZNK21single_purpose_kernelclEv -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc.ascast) #[[AMDGCN_ATTR1:[0-9]+]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc) #[[AMDGCN_ATTR1:[0-9]+]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // 
CHECK-AMDGCN: define linkonce_odr void @_ZNK21single_purpose_kernelclEv @@ -425,11 +425,11 @@ int main() { // CHECK-AMDGCN-SAME: (i32 %kernelFunc.coerce) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: // CHECK-AMDGCN-NEXT: %kernelFunc = alloca %class.anon, align 4, addrspace(5) -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %kernelFunc, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %kernelFunc.coerce, ptr addrspace(5) %coerce.dive, align 4 -// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr +// CHECK-AMDGCN-NEXT: %kernelFunc1 = addrspacecast ptr addrspace(5) %kernelFunc to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %kernelFunc1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %kernelFunc.coerce, ptr %coerce.dive, align 4 // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %kernelFunc.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %kernelFunc1, i32 noundef 42) #[[AMDGCN_ATTR1]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // CHECK-AMDGCN: define internal void @_ZZ4mainENKUlT_E_clIiEEDaS_ @@ -462,11 +462,11 @@ int main() { // CHECK-AMDGCN-NEXT: define dso_local amdgpu_kernel void @"_ZTS6\CE\B4\CF\84\CF\87" // CHECK-AMDGCN-SAME: (ptr addrspace(4) noundef byref(%class.anon.0) align 1 %0) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: -// CHECK-AMDGCN-NEXT: %kernelFunc = alloca %class.anon.0, align 1, addrspace(5) -// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false) -// CHECK-AMDGCN-NEXT: %kernelFunc.ascast = addrspacecast ptr addrspace(5) %kernelFunc to ptr +// CHECK-AMDGCN-NEXT: %coerce = alloca %class.anon.0, align 1, addrspace(5) +// CHECK-AMDGCN-NEXT: %kernelFunc = addrspacecast ptr 
addrspace(5) %coerce to ptr +// CHECK-AMDGCN-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 1 %kernelFunc, ptr addrspace(4) align 1 %0, i64 1, i1 false) // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUliE_clEi -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc.ascast, i32 noundef 42) #[[AMDGCN_ATTR1:[0-9]+]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 1 dereferenceable(1) %kernelFunc, i32 noundef 42) #[[AMDGCN_ATTR1:[0-9]+]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // CHECK-AMDGCN: define internal void @_ZZ4mainENKUliE_clEi @@ -502,18 +502,18 @@ int main() { // CHECK-AMDGCN-NEXT: %k = alloca %class.anon.1, align 4, addrspace(5) // CHECK-AMDGCN-NEXT: %a.addr = alloca i32, align 4, addrspace(5) // CHECK-AMDGCN-NEXT: %b.addr = alloca i32, align 4, addrspace(5) +// CHECK-AMDGCN-NEXT: %k2 = addrspacecast ptr addrspace(5) %k to ptr // CHECK-AMDGCN-NEXT: %a.addr.ascast = addrspacecast ptr addrspace(5) %a.addr to ptr // CHECK-AMDGCN-NEXT: %b.addr.ascast = addrspacecast ptr addrspace(5) %b.addr to ptr -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.1, ptr addrspace(5) %k, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: %coerce.dive1 = getelementptr inbounds nuw %struct.copyable, ptr addrspace(5) %coerce.dive, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %k.coerce, ptr addrspace(5) %coerce.dive1, align 4 -// CHECK-AMDGCN-NEXT: %k.ascast = addrspacecast ptr addrspace(5) %k to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.1, ptr %k2, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: %coerce.dive1 = getelementptr inbounds nuw %struct.copyable, ptr %coerce.dive, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %k.coerce, ptr %coerce.dive1, align 4 // CHECK-AMDGCN-NEXT: store i32 %a, ptr %a.addr.ascast, align 4 // CHECK-AMDGCN-NEXT: store i32 %b, ptr %b.addr.ascast, align 4 // CHECK-AMDGCN-NEXT: %0 = load i32, ptr %a.addr.ascast, align 4 // CHECK-AMDGCN-NEXT: %1 = load i32, ptr %b.addr.ascast, 
align 4 // CHECK-AMDGCN-NEXT: %call = call noundef i32 @_ZZ4mainENKUliiE_clEii -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %k.ascast, i32 noundef %0, i32 noundef %1) #[[AMDGCN_ATTR1:[0-9]+]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %k2, i32 noundef %0, i32 noundef %1) #[[AMDGCN_ATTR1:[0-9]+]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // @@ -539,9 +539,10 @@ int main() { // CHECK-SPIRNV-NEXT: %k.indirect_addr = alloca ptr addrspace(4), align {{[48]}} // CHECK-SPIRNV-NEXT: %a.addr = alloca i32, align 4 // CHECK-SPIRNV-NEXT: %b.addr = alloca i32, align 4 +// CHECK-SPIRNV-NEXT: %k.indirect_addr.ascast = addrspacecast ptr %k.indirect_addr to ptr addrspace(4) // CHECK-SPIRNV-NEXT: %a.addr.ascast = addrspacecast ptr %a.addr to ptr addrspace(4) // CHECK-SPIRNV-NEXT: %b.addr.ascast = addrspacecast ptr %b.addr to ptr addrspace(4) -// CHECK-SPIRNV-NEXT: store ptr %k, ptr %k.indirect_addr, align {{[48]}} +// CHECK-SPIRNV-NEXT: store ptr %k, ptr addrspace(4) %k.indirect_addr.ascast, align {{[48]}} // CHECK-SPIRNV-NEXT: %k.ascast = addrspacecast ptr %k to ptr addrspace(4) // CHECK-SPIRNV-NEXT: store i32 %a, ptr addrspace(4) %a.addr.ascast, align 4 // CHECK-SPIRNV-NEXT: store i32 %b, ptr addrspace(4) %b.addr.ascast, align 4 @@ -578,11 +579,11 @@ int main() { // CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: // CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5) -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4 -// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr 
%coerce.dive, align 4 // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // @@ -611,11 +612,11 @@ int main() { // CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: // CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5) -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4 -// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4 // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // @@ -644,11 +645,11 @@ int main() { // CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: // CHECK-AMDGCN-NEXT: %ref = alloca %class.anon, align 4, addrspace(5) -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr addrspace(5) %ref, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4 -// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// 
CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4 // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E_clIiEEDaS_ -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // @@ -677,11 +678,11 @@ int main() { // CHECK-AMDGCN-SAME: (i32 %ref.coerce) #[[AMDGCN_ATTR0]] { // CHECK-AMDGCN-NEXT: entry: // CHECK-AMDGCN-NEXT: %ref = alloca %class.anon.2, align 4, addrspace(5) -// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.2, ptr addrspace(5) %ref, i32 0, i32 0 -// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr addrspace(5) %coerce.dive, align 4 -// CHECK-AMDGCN-NEXT: %ref.ascast = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %ref1 = addrspacecast ptr addrspace(5) %ref to ptr +// CHECK-AMDGCN-NEXT: %coerce.dive = getelementptr inbounds nuw %class.anon.2, ptr %ref1, i32 0, i32 0 +// CHECK-AMDGCN-NEXT: store i32 %ref.coerce, ptr %coerce.dive, align 4 // CHECK-AMDGCN-NEXT: call void @_ZZ4mainENKUlT_E0_clIiEEDaS_ -// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref.ascast, i32 noundef 42) #[[AMDGCN_ATTR1]] +// CHECK-AMDGCN-SAME: (ptr noundef nonnull align 4 dereferenceable(4) %ref1, i32 noundef 42) #[[AMDGCN_ATTR1]] // CHECK-AMDGCN-NEXT: ret void // CHECK-AMDGCN-NEXT: } // diff --git a/clang/test/OpenMP/amdgcn_target_device_vla.cpp b/clang/test/OpenMP/amdgcn_target_device_vla.cpp index 5064c114c0863..3bdc95fbc1152 100644 --- a/clang/test/OpenMP/amdgcn_target_device_vla.cpp +++ b/clang/test/OpenMP/amdgcn_target_device_vla.cpp @@ -190,7 +190,9 @@ int main() { // CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr // CHECK-NEXT: 
[[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr // CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr // CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8 @@ -203,12 +205,11 @@ int main() { // CHECK: user_code.entry: // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8 // CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_omp_outlined(ptr [[TMP6]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo2v_l30_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4:[0-9]+]] // CHECK-NEXT: 
call void @__kmpc_target_deinit() // CHECK-NEXT: ret void // CHECK: worker.exit: @@ -250,6 +251,7 @@ int main() { // CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr // CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // CHECK-NEXT: [[I3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I3]] to ptr +// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr // CHECK-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -307,8 +309,8 @@ int main() { // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 // CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(5) [[M_CASTED]], align 4 -// CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP19]], ptr [[M_CASTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8 // CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP16]] to ptr // CHECK-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 8 @@ -557,7 +559,9 @@ int main() { // CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr // CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr // CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr // CHECK-NEXT: 
[[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // CHECK-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR_ASCAST]], align 8 @@ -570,12 +574,11 @@ int main() { // CHECK: user_code.entry: // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8 // CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_omp_outlined(ptr [[TMP6]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo3v_l52_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]] // CHECK-NEXT: call void @__kmpc_target_deinit() // CHECK-NEXT: ret void // CHECK: worker.exit: @@ -916,7 +919,10 @@ int main() { // CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr // CHECK-NEXT: [[RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RESULT_ADDR]] to ptr // 
CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[M_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M_CASTED]] to ptr +// CHECK-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // CHECK-NEXT: store i64 [[M]], ptr [[M_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 @@ -930,15 +936,14 @@ int main() { // CHECK: user_code.entry: // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[M_CASTED]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr addrspace(5) [[M_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP4]], ptr [[M_CASTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[M_CASTED_ASCAST]], align 8 // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// CHECK-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], 
i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z4foo4v_l76_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]]) #[[ATTR4]] // CHECK-NEXT: call void @__kmpc_target_deinit() // CHECK-NEXT: ret void // CHECK: worker.exit: diff --git a/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c index 3f9d2225c7de1..eb0e38b5cf2cd 100644 --- a/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c +++ b/clang/test/OpenMP/amdgpu_target_with_aligned_attribute.c @@ -30,7 +30,9 @@ void write_to_aligned_array(int *a, int N) { // CHECK-AMD-NEXT: [[N_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_ADDR]] to ptr // CHECK-AMD-NEXT: [[APTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[APTR_ADDR]] to ptr // CHECK-AMD-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // CHECK-AMD-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// CHECK-AMD-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // CHECK-AMD-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // CHECK-AMD-NEXT: store ptr [[APTR]], ptr [[APTR_ADDR_ASCAST]], align 8 // CHECK-AMD-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 @@ -40,13 +42,12 @@ void write_to_aligned_array(int *a, int N) { // CHECK-AMD: user_code.entry: // CHECK-AMD-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // CHECK-AMD-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr addrspace(5) 
[[N_CASTED]], align 4 -// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// CHECK-AMD-NEXT: store i32 [[TMP2]], ptr [[N_CASTED_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // CHECK-AMD-NEXT: [[TMP4:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8 // CHECK-AMD-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// CHECK-AMD-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// CHECK-AMD-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_omp_outlined(ptr [[TMP5]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]] +// CHECK-AMD-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// CHECK-AMD-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_write_to_aligned_array_l14_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP3]], ptr [[TMP4]]) #[[ATTR2:[0-9]+]] // CHECK-AMD-NEXT: call void @__kmpc_target_deinit() // CHECK-AMD-NEXT: ret void // CHECK-AMD: worker.exit: @@ -86,6 +87,7 @@ void write_to_aligned_array(int *a, int N) { // CHECK-AMD-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr // CHECK-AMD-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // CHECK-AMD-NEXT: [[I3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I3]] to ptr +// CHECK-AMD-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // CHECK-AMD-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // CHECK-AMD-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // CHECK-AMD-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -140,8 +142,8 @@ void 
write_to_aligned_array(int *a, int N) { // CHECK-AMD-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 // CHECK-AMD-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 // CHECK-AMD-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// CHECK-AMD-NEXT: store i32 [[TMP17]], ptr addrspace(5) [[N_CASTED]], align 4 -// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// CHECK-AMD-NEXT: store i32 [[TMP17]], ptr [[N_CASTED_ASCAST]], align 4 +// CHECK-AMD-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // CHECK-AMD-NEXT: [[TMP19:%.*]] = load ptr, ptr [[APTR_ADDR_ASCAST]], align 8 // CHECK-AMD-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // CHECK-AMD-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP14]] to ptr diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp index 0c04b3c429d7a..d6a1280cfcc26 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp @@ -40,7 +40,9 @@ int foo() { // IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr // IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr // IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 @@ -51,12 +53,11 @@ int foo() { // 
IR-GPU: user_code.entry: // IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[J_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP2]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NEXT: [[TMP5:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[TMP5]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]] +// IR-GPU-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]] // IR-GPU-NEXT: call void @__kmpc_target_deinit() // IR-GPU-NEXT: ret void // IR-GPU: worker.exit: @@ -100,6 +101,7 @@ int foo() { // IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr // IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr // IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr +// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ 
-149,8 +151,8 @@ int foo() { // IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 // IR-GPU-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 // IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[J_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr // IR-GPU-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8 diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp index 3686ca7ea08e0..f484dd140bc6c 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_distribute.cpp @@ -51,7 +51,9 @@ int main() // IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr // IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 @@ -68,12 +70,11 @@ int main() // IR-GPU: 
user_code.entry: // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] +// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] // IR-GPU-NEXT: call void @__kmpc_target_deinit() // IR-GPU-NEXT: ret void // IR-GPU: worker.exit: diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp index 11ae386739b40..3a3220a170e93 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen_as_parallel_for.cpp @@ -89,7 +89,9 @@ int main() // IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr // IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // IR-GPU-NEXT: 
[[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 @@ -106,12 +108,11 @@ int main() // IR-GPU: user_code.entry: // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] +// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] // IR-GPU-NEXT: call void @__kmpc_target_deinit() // 
IR-GPU-NEXT: ret void // IR-GPU: worker.exit: @@ -157,6 +158,7 @@ int main() // IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr // IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -218,8 +220,8 @@ int main() // IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 // IR-GPU-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 // IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP21]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP23:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to ptr // IR-GPU-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8 @@ -423,7 +425,9 @@ int main() // IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr // IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 @@ -440,12 +444,11 @@ int main() // IR-GPU: user_code.entry: // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]] +// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l46_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]] // IR-GPU-NEXT: call void @__kmpc_target_deinit() // IR-GPU-NEXT: ret void // IR-GPU: worker.exit: @@ -499,6 +502,7 @@ int main() // IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // IR-GPU-NEXT: [[I11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[I11]] to ptr // IR-GPU-NEXT: [[J12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J12]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -572,8 +576,8 @@ int main() // IR-GPU-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_LB_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_UB_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP22]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP23:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP24:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP20]] to ptr // IR-GPU-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8 @@ -828,7 +832,10 @@ int main() // IR-GPU-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr // IR-GPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr +// IR-GPU-NEXT: [[NT_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NT_CASTED]] to ptr // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[NT]], ptr [[NT_ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 @@ -846,15 +853,14 @@ int main() // IR-GPU: user_code.entry: // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[NT_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP8]], ptr addrspace(5) [[NT_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP9:%.*]] = load i64, ptr addrspace(5) [[NT_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP8]], ptr [[NT_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP9:%.*]] = load i64, ptr [[NT_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55_omp_outlined(ptr [[TMP10]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]] +// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l55_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2]] // IR-GPU-NEXT: call void 
@__kmpc_target_deinit() // IR-GPU-NEXT: ret void // IR-GPU: worker.exit: @@ -903,6 +909,8 @@ int main() // IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr // IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // IR-GPU-NEXT: [[I5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I5]] to ptr +// IR-GPU-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr +// IR-GPU-NEXT: [[NT_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NT_CASTED]] to ptr // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -965,11 +973,11 @@ int main() // IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 // IR-GPU-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 // IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP21]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP21]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[NT_ADDR_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP23]], ptr addrspace(5) [[NT_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP24:%.*]] = load i64, ptr addrspace(5) [[NT_CASTED]], align 8 +// IR-GPU-NEXT: store i32 [[TMP23]], ptr [[NT_CASTED_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP24:%.*]] = load i64, ptr [[NT_CASTED_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP25:%.*]] = getelementptr inbounds [8 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP18]] to ptr // 
IR-GPU-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8 @@ -2954,7 +2962,9 @@ int main() // IR-GPU-NESTED-NEXT: [[VLA_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR2]] to ptr // IR-GPU-NESTED-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // IR-GPU-NESTED-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// IR-GPU-NESTED-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NESTED-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr +// IR-GPU-NESTED-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr // IR-GPU-NESTED-NEXT: store i64 [[N]], ptr [[N_ADDR_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 @@ -2971,12 +2981,11 @@ int main() // IR-GPU-NESTED: user_code.entry: // IR-GPU-NESTED-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr)) // IR-GPU-NESTED-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NESTED-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NESTED-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NESTED-NEXT: store i32 [[TMP6]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NESTED-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4 -// IR-GPU-NESTED-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[DOTTHREADID_TEMP_]], align 4 -// IR-GPU-NESTED-NEXT: [[TMP8:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr -// IR-GPU-NESTED-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_omp_outlined(ptr [[TMP8]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr 
[[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] +// IR-GPU-NESTED-NEXT: store i32 [[TMP5]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4 +// IR-GPU-NESTED-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l64_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP7]], i64 [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], ptr [[TMP3]]) #[[ATTR2:[0-9]+]] // IR-GPU-NESTED-NEXT: call void @__kmpc_target_deinit() // IR-GPU-NESTED-NEXT: ret void // IR-GPU-NESTED: worker.exit: @@ -3030,6 +3039,7 @@ int main() // IR-GPU-NESTED-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr // IR-GPU-NESTED-NEXT: [[I11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I11]] to ptr // IR-GPU-NESTED-NEXT: [[J12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J12]] to ptr +// IR-GPU-NESTED-NEXT: [[N_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[N_CASTED]] to ptr // IR-GPU-NESTED-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr // IR-GPU-NESTED-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8 @@ -3103,8 +3113,8 @@ int main() // IR-GPU-NESTED-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_LB_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_UB_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR_ASCAST]], align 4 -// IR-GPU-NESTED-NEXT: store i32 [[TMP22]], ptr addrspace(5) [[N_CASTED]], align 4 -// IR-GPU-NESTED-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[N_CASTED]], align 8 +// IR-GPU-NESTED-NEXT: store i32 [[TMP22]], ptr [[N_CASTED_ASCAST]], align 4 +// IR-GPU-NESTED-NEXT: [[TMP23:%.*]] = load i64, ptr [[N_CASTED_ASCAST]], align 8 // IR-GPU-NESTED-NEXT: [[TMP24:%.*]] = getelementptr inbounds [7 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 // 
IR-GPU-NESTED-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP20]] to ptr // IR-GPU-NESTED-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8