diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h
index 22e0b1bc901a4..ed54b0c077b4a 100644
--- a/llvm/include/llvm/Analysis/InlineCost.h
+++ b/llvm/include/llvm/Analysis/InlineCost.h
@@ -318,6 +318,7 @@ std::optional<int> getInliningCostEstimate(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
     ProfileSummaryInfo *PSI = nullptr,
     OptimizationRemarkEmitter *ORE = nullptr);
 
@@ -327,6 +328,7 @@ std::optional<InlineCostFeatures> getInliningCostFeatures(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
     ProfileSummaryInfo *PSI = nullptr,
     OptimizationRemarkEmitter *ORE = nullptr);
 
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 22bb406c01a4e..32acf23e1d0d0 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -249,6 +249,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Getter for BlockFrequencyInfo
   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
 
+  /// Getter for TargetLibraryInfo
+  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
+
   /// Profile summary information.
   ProfileSummaryInfo *PSI;
 
@@ -433,6 +436,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool simplifyIntrinsicCallIsConstant(CallBase &CB);
   bool simplifyIntrinsicCallObjectSize(CallBase &CB);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
+  bool isLoweredToCall(Function *F, CallBase &Call);
 
   /// Return true if the given argument to the function being considered for
   /// inlining has the given attribute set either at the call site or the
@@ -492,13 +496,15 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitUnreachableInst(UnreachableInst &I);
 
 public:
-  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
-               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
-               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
-               ProfileSummaryInfo *PSI = nullptr,
-               OptimizationRemarkEmitter *ORE = nullptr)
+  CallAnalyzer(
+      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
+      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
+      ProfileSummaryInfo *PSI = nullptr,
+      OptimizationRemarkEmitter *ORE = nullptr)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
-        PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
+        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
         CandidateCall(Call) {}
 
   InlineResult analyze();
@@ -688,7 +694,8 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
       /// to instantiate the derived class.
       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
-                                GetAssumptionCache, GetBFI, PSI, ORE, false);
+                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
+                                false);
       if (CA.analyze().isSuccess()) {
         // We were able to inline the indirect call! Subtract the cost from the
         // threshold to get the bonus we want to apply, but don't go below zero.
@@ -1106,10 +1113,12 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
       const TargetTransformInfo &TTI,
       function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
       function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
       ProfileSummaryInfo *PSI = nullptr,
       OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
       bool IgnoreThreshold = false)
-      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
+      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
+                     ORE),
         ComputeFullInlineCost(OptComputeFullInlineCost ||
                               Params.ComputeFullInlineCost || ORE ||
                               isCostBenefitAnalysisEnabled()),
@@ -1228,8 +1237,8 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
           InlineConstants::IndirectCallThreshold;
 
       InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
-                                GetAssumptionCache, GetBFI, PSI, ORE, false,
-                                true);
+                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
+                                false, true);
       if (CA.analyze().isSuccess()) {
         increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                   CA.getCost());
@@ -1355,9 +1364,11 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
       const TargetTransformInfo &TTI,
       function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
       function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
       CallBase &Call)
-      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
+      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
+                     PSI) {}
 
   const InlineCostFeatures &features() const { return Cost; }
 };
@@ -2260,6 +2271,44 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
   return false;
 }
 
+bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
+  const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
+  LibFunc LF;
+  if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
+    return TTI.isLoweredToCall(F);
+
+  switch (LF) {
+  case LibFunc_memcpy_chk:
+  case LibFunc_memmove_chk:
+  case LibFunc_mempcpy_chk:
+  case LibFunc_memset_chk: {
+    // Calls to __memcpy_chk whose length is known to fit within the object
+    // size will eventually be replaced by inline stores. Therefore, these
+    // should not incur a call penalty. This is only really relevant on
+    // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin),
+    // as other platforms use memcpy intrinsics, which are already exempt from
+    // the call penalty.
+    auto *LenOp = dyn_cast<ConstantInt>(Call.getOperand(2));
+    if (!LenOp)
+      LenOp = dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(Call.getOperand(2)));
+    auto *ObjSizeOp = dyn_cast<ConstantInt>(Call.getOperand(3));
+    if (!ObjSizeOp)
+      ObjSizeOp = dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(Call.getOperand(3)));
+    if (LenOp && ObjSizeOp &&
+        LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue()) {
+      return false;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+
+  return TTI.isLoweredToCall(F);
+}
+
 bool CallAnalyzer::visitCallBase(CallBase &Call) {
   if (!onCallBaseVisitStart(Call))
     return true;
@@ -2341,7 +2390,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
     return false;
   }
 
-  if (TTI.isLoweredToCall(F)) {
+  if (isLoweredToCall(F, Call)) {
     onLoweredCall(F, Call, IsIndirectCall);
   }
 
@@ -2945,6 +2994,7 @@ std::optional<int> llvm::getInliningCostEstimate(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   const InlineParams Params = {/* DefaultThreshold*/ 0,
                                /*HintThreshold*/ {},
@@ -2958,7 +3008,7 @@ std::optional<int> llvm::getInliningCostEstimate(
                                /*EnableDeferral*/ true};
 
   InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
-                            GetAssumptionCache, GetBFI, PSI, ORE, true,
+                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
                             /*IgnoreThreshold*/ true);
   auto R = CA.analyze();
   if (!R.isSuccess())
@@ -2970,9 +3020,10 @@ std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
     CallBase &Call, TargetTransformInfo &CalleeTTI,
     function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
     function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
-  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
-                                 ORE, *Call.getCalledFunction(), Call);
+  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
+                                 PSI, ORE, *Call.getCalledFunction(), Call);
   auto R = CFA.analyze();
   if (!R.isSuccess())
     return std::nullopt;
@@ -3072,7 +3123,7 @@ InlineCost llvm::getInlineCost(
                           << ")\n");
 
   InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
-                            GetAssumptionCache, GetBFI, PSI, ORE);
+                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
 
   InlineResult ShouldInline = CA.analyze();
   LLVM_DEBUG(CA.dump());
@@ -3263,7 +3314,8 @@ InlineCostAnnotationPrinterPass::run(Function &F,
           continue;
         OptimizationRemarkEmitter ORE(CalledFunction);
         InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
-                                    GetAssumptionCache, nullptr, &PSI, &ORE);
+                                    GetAssumptionCache, nullptr, nullptr, &PSI,
+                                    &ORE);
         ICCA.analyze();
         OS << "    Analyzing call of " << CalledFunction->getName()
(caller:" << CB->getCaller()->getName() << ")\n"; diff --git a/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll new file mode 100644 index 0000000000000..17f7024ff8905 --- /dev/null +++ b/llvm/test/Transforms/Inline/AArch64/memcpy-constant-size.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt %s -mtriple=arm64-apple-macosx -passes=inline -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s + +declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1) +declare ptr @__memcpy_chk(ptr, ptr, i64, i64) +declare ptr @__memmove_chk(ptr, ptr, i64, i64) +declare ptr @__mempcpy_chk(ptr, ptr, i64, i64) +declare ptr @__memset_chk(ptr, i32, i64, i64) + +define void @callee(ptr %dst, ptr %src, i64 %size) { +; CHECK-LABEL: define void @callee +; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[OBJSIZE:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[DST]], i1 false, i1 true, i1 false) +; CHECK-NEXT: [[CALL_MEMCPY:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]]) +; CHECK-NEXT: [[CALL_MEMMOVE:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]]) +; CHECK-NEXT: [[CALL_MEMPCPY:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 [[OBJSIZE]]) +; CHECK-NEXT: [[CALL_MEMSET:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 [[SIZE]], i64 [[OBJSIZE]]) +; CHECK-NEXT: ret void +; + %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false) + %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize) + %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize) + %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize) + %call.memset = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 %objsize) + ret void +} + +define void @caller(ptr %dst, ptr %src) { +; CHECK-LABEL: define void @caller +; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[OBJSIZE_I:%.*]] = call i64 @llvm.objectsize.i64.p0(ptr [[DST]], i1 false, i1 true, i1 false) +; CHECK-NEXT: [[CALL_MEMCPY_I:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]]) +; CHECK-NEXT: [[CALL_MEMMOVE_I:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]]) +; CHECK-NEXT: [[CALL_MEMPCPY_I:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 4, i64 [[OBJSIZE_I]]) +; CHECK-NEXT: [[CALL_MEMSET_I:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 4, i64 [[OBJSIZE_I]]) +; CHECK-NEXT: ret void +; + call void @callee(ptr %dst, ptr %src, i64 4) + ret void +} + +define void @objsize_toosmall_callee(ptr %dst, ptr %src, i64 %size) { +; CHECK-LABEL: define void @objsize_toosmall_callee +; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[CALL_MEMCPY:%.*]] = call ptr @__memcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1) +; CHECK-NEXT: [[CALL_MEMMOVE:%.*]] = call ptr @__memmove_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1) +; CHECK-NEXT: [[CALL_MEMPCPY:%.*]] = call ptr @__mempcpy_chk(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i64 1) +; CHECK-NEXT: [[CALL_MEMSET:%.*]] = call ptr @__memset_chk(ptr [[DST]], i32 0, i64 [[SIZE]], i64 1) +; CHECK-NEXT: ret void +; + %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 1) + %call.memmove = call ptr 
+  %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 1)
+  %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 1)
+  %call.memset = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 1)
+  ret void
+}
+
+define void @objsize_toosmall_caller(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @objsize_toosmall_caller
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    call void @objsize_toosmall_callee(ptr [[DST]], ptr [[SRC]], i64 4)
+; CHECK-NEXT:    ret void
+;
+  call void @objsize_toosmall_callee(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @intrinsics_callee(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @intrinsics_callee
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST]], i8 0, i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %size, i1 false)
+  call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 %size, i1 false)
+  call void @llvm.memset.p0.i64(ptr %dst, i8 0, i64 %size, i1 false)
+  ret void
+}
+
+define void @intrinsics_caller(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @intrinsics_caller
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr [[DST]], ptr [[SRC]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST]], i8 0, i64 4, i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @intrinsics_callee(ptr %dst, ptr %src, i64 4)
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
new file mode 100644
index 0000000000000..10b07ad6e7491
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/memcpy-constant-size.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt %s -mtriple=arm64-apple-macosx -passes='default<O3>' -inline-threshold=2 -inline-call-penalty=5 -S | FileCheck %s
+
+declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1)
+declare ptr @__memcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memmove_chk(ptr, ptr, i64, i64)
+declare ptr @__mempcpy_chk(ptr, ptr, i64, i64)
+declare ptr @__memset_chk(ptr, i32, i64, i64)
+
+define void @callee_memcpy(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_memcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memcpy = call ptr @__memcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_memmove(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_memmove
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    tail call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.memmove = call ptr @__memmove_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_mempcpy(ptr %dst, ptr %src, i64 %size) {
+; CHECK-LABEL: define void @callee_mempcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DST]], ptr align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.mempcpy = call ptr @__mempcpy_chk(ptr %dst, ptr %src, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @callee_memset(ptr %dst, i64 %size) {
+; CHECK-LABEL: define void @callee_memset
+; CHECK-SAME: (ptr [[DST:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    tail call void @llvm.memset.p0.i64(ptr align 1 [[DST]], i8 0, i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  %objsize = call i64 @llvm.objectsize.i64.p0(ptr %dst, i1 false, i1 true, i1 false)
+  %call.mempcpy = call ptr @__memset_chk(ptr %dst, i32 0, i64 %size, i64 %objsize)
+  ret void
+}
+
+define void @caller_memcpy(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_memcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memcpy(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_memmove(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_memmove
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memmove(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_mempcpy(ptr %dst, ptr %src) {
+; CHECK-LABEL: define void @caller_mempcpy
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @callee_mempcpy(ptr %dst, ptr %src, i64 4)
+  ret void
+}
+
+define void @caller_memset(ptr %dst) {
+; CHECK-LABEL: define void @caller_memset
+; CHECK-SAME: (ptr [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+  call void @callee_memset(ptr %dst, i64 4)
+  ret void
+}