microsoft · pow2clk · Jun 8, 2026 · Jun 9, 2026
diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
@@ -2487,6 +2487,8 @@ extern const char *kHostLayoutTypePrefix;
 
 extern const char *kWaveOpsIncludeHelperLanesString;
 
+extern const char *kPreciseString; // Function attribute on precise markers.
+
 } // namespace DXIL
 
 } // namespace hlsl
diff --git a/lib/DXIL/DxilModule.cpp b/lib/DXIL/DxilModule.cpp
@@ -87,6 +87,9 @@ const char *kDxLinAlgMatrixTypePrefix = "dx.types.LinAlgMatrix";
 const char *kHostLayoutTypePrefix = "hostlayout.";
 
 const char *kWaveOpsIncludeHelperLanesString = "waveops-include-helper-lanes";
+
+const char *kPreciseString = "dx.precise";
+
 } // namespace DXIL
 
 void SetDxilHook(Module &M);

diff --git a/lib/HLSL/HLModule.cpp b/lib/HLSL/HLModule.cpp
@@ -1022,19 +1022,18 @@ void HLModule::ClearPreciseAttributeWithMetadata(Instruction *I) {
 }
 
 static void MarkPreciseAttribute(Function *F) {
-  LLVMContext &Ctx = F->getContext();
-  MDNode *preciseNode = MDNode::get(
-      Ctx, {MDString::get(Ctx, DxilMDHelper::kDxilPreciseAttributeMDName)});
-
-  F->setMetadata(DxilMDHelper::kDxilPreciseAttributeMDName, preciseNode);
+  F->addFnAttr(DXIL::kPreciseString); // Queried by HasPreciseAttribute().
 }
 
 template <typename BuilderTy>
 void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V,
                                                          BuilderTy &Builder,
                                                          llvm::Module &M) {
   Type *Ty = V->getType();
-  Type *EltTy = Ty->getScalarType();
+  // Native vectors (SM 6.9+) are marked whole; older targets per element.
+  bool SupportsVectors =
+      M.HasHLModule() && M.GetHLModule().GetShaderModel()->IsSM69Plus();
+  Type *EltTy = SupportsVectors ? Ty : Ty->getScalarType();
 
   // TODO: Only do this on basic types.
 
@@ -1050,7 +1049,8 @@ void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V,
       cast<Function>(M.getOrInsertFunction(preciseFuncName, preciseFuncTy));
   if (!HLModule::HasPreciseAttribute(preciseFunc))
     MarkPreciseAttribute(preciseFunc);
-  if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
+  if (!SupportsVectors && isa<FixedVectorType>(Ty)) {
+    FixedVectorType *VT = cast<FixedVectorType>(Ty);
     for (unsigned i = 0; i < VT->getNumElements(); i++) {
       Value *Elt = Builder.CreateExtractElement(V, i);
       Builder.CreateCall(preciseFunc, {Elt});
@@ -1103,9 +1103,9 @@ void HLModule::MarkPreciseAttributeOnPtrWithFunctionCall(llvm::Value *Ptr,
 }
 
 bool HLModule::HasPreciseAttribute(Function *F) {
-  MDNode *preciseNode =
-      F->getMetadata(DxilMDHelper::kDxilPreciseAttributeMDName);
-  return preciseNode != nullptr;
+  // Precise marker functions carry the DXIL::kPreciseString function
+  // attribute (see MarkPreciseAttribute) instead of a metadata node.
+  return F->hasFnAttribute(DXIL::kPreciseString);
 }
 
 static void AddDIGlobalVariable(DIBuilder &Builder, DIGlobalVariable *LocDIGV,

diff --git a/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp b/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp
@@ -270,6 +270,11 @@ class DxilConditionalMem2Reg : public FunctionPass {
   static bool ScalarizePreciseVectorAlloca(Function &F) {
     BasicBlock *Entry = &*F.begin();
 
+    // SM 6.9+ has native vectors, so precise vector allocas are left whole.
+    Module *M = F.getParent();
+    if (M->HasHLModule() && M->GetHLModule().GetShaderModel()->IsSM69Plus())
+      return false;
+
     SmallVector<AllocaInst *, 4> PreciseAllocaInsts;
     for (auto it = Entry->begin(); it != Entry->end();) {
       Instruction *I = &*(it++);

diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -T vs_6_9 %s | FileCheck %s
+// Tests a specific case of a precise native vector requiring extraction
+// and reinsertion during the alloca phase where conditionalmem2reg is concerned.
+// Serves as the source for longvec-precise.ll and its specific pass tests
+
+precise float4 main (float4 pos : POSITION, float4 scale : SCL, float4 shift : OFF) : SV_Position {
+  precise float4 position = pos;
+  // Initial multiplication to avoid optimization just using the input scalar
+  // CHECK-NOT: fmul fast
+  // CHECK: [[pos:%.*]] = fmul <4 x float>
+  position = position * scale;
+
+  // CHECK: [[z:%.*]] = extractelement <4 x float> [[pos]], i32 2
+  // CHECK-NOT: fadd fast
+  // CHECK: [[sz:%.*]] = fadd float [[z]], 0x
+  // CHECK: [[spos:%.*]] = insertelement <4 x float> [[pos]], float [[sz]], i32 2
+  position.z += 0.01f;
+  // CHECK-NOT: fadd fast
+  // CHECK: fadd <4 x float> %{{.*}}, [[spos]]
+  position += shift;
+
+  return position;
+}
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll
@@ -0,0 +1,112 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxil-cond-mem2reg -S | FileCheck %s
+; Test that for SM 6.9+ conditionalmem2reg keeps precise native vector
+; allocas whole instead of scalarizing them as it does for earlier shader
+; models; the allocas are kept to maintain the precise information.
+
+; The checks are just confirming that the precise calls are preserved
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define void @main(<4 x float>* noalias %arg, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3) #0 {
+entry:
+  %shift.addr = alloca <4 x float>, align 4, !dx.temp !9
+  %scale.addr = alloca <4 x float>, align 4, !dx.temp !9
+  %pos.addr = alloca <4 x float>, align 4, !dx.temp !9
+
+  ; Confirm that position is the only alloca that is preserved
+  ; CHECK-NOT: alloca
+  ; CHECK: %position = alloca <4 x float>, align 4, !dx.precise
+  ; CHECK-NOT: alloca
+  %position = alloca <4 x float>, align 4, !dx.precise !23
+  store <4 x float> %arg3, <4 x float>* %shift.addr, align 4, !tbaa !24
+  store <4 x float> %arg2, <4 x float>* %scale.addr, align 4, !tbaa !24
+  store <4 x float> %arg1, <4 x float>* %pos.addr, align 4, !tbaa !24
+  %tmp = bitcast <4 x float>* %position to i8*
+  call void @llvm.lifetime.start(i64 16, i8* %tmp) #0
+  %tmp4 = load <4 x float>, <4 x float>* %pos.addr, align 4, !tbaa !24
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %arg1)
+  ; CHECK-NEXT: store <4 x float> %arg1, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4)
+  store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !24
+  %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %tmp6 = load <4 x float>, <4 x float>* %scale.addr, align 4, !tbaa !24
+  %mul = fmul <4 x float> %tmp5, %tmp6
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %mul)
+  ; CHECK-NEXT: store <4 x float> %mul, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %mul)
+  store <4 x float> %mul, <4 x float>* %position, align 4, !tbaa !24
+  %tmp7 = load <4 x float>, <4 x float>* %position, align 4
+  %tmp8 = extractelement <4 x float> %tmp7, i32 2
+  %add = fadd float %tmp8, 0x3F847AE140000000
+  %tmp9 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2
+
+  ; CHECK: call void @dx.attribute.precise.float(float %add)
+  ; CHECK-NEXT: store float %add, float* %tmp9
+  call void @dx.attribute.precise.float(float %add)
+  store float %add, float* %tmp9
+  %tmp10 = load <4 x float>, <4 x float>* %shift.addr, align 4, !tbaa !24
+  %tmp11 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %add1 = fadd <4 x float> %tmp11, %tmp10
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1)
+  ; CHECK-NEXT: store <4 x float> %add1, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1)
+  store <4 x float> %add1, <4 x float>* %position, align 4, !tbaa !24
+  %tmp12 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %tmp13 = bitcast <4 x float>* %position to i8*
+  call void @llvm.lifetime.end(i64 16, i8* %tmp13) #0
+  store <4 x float> %tmp12, <4 x float>* %arg
+  ret void
+}
+
+declare void @"dx.attribute.precise.<4 x float>"(<4 x float>) #1
+
+declare void @dx.attribute.precise.float(float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { "dx.precise" }
+
+!pauseresume = !{!1}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!19}
+!dx.fnprops = !{!20}
+!dx.options = !{!21, !22}
+
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!3 = !{i32 1, i32 9}
+!4 = !{i32 1, i32 10}
+!5 = !{!"vs", i32 6, i32 9}
+!6 = !{i32 1, void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !7}
+!7 = !{!8, !10, !13, !15, !17}
+!8 = !{i32 0, !9, !9}
+!9 = !{}
+!10 = !{i32 1, !11, !12}
+!11 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9}
+!12 = !{i32 0}
+!13 = !{i32 0, !14, !12}
+!14 = !{i32 4, !"POSITION", i32 7, i32 9}
+!15 = !{i32 0, !16, !12}
+!16 = !{i32 4, !"SCL", i32 7, i32 9}
+!17 = !{i32 0, !18, !12}
+!18 = !{i32 4, !"OFF", i32 7, i32 9}
+!19 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, null, null}
+!20 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, i32 1}
+!21 = !{i32 64}
+!22 = !{i32 -1}
+!23 = !{i32 1}
+!24 = !{!25, !25, i64 0}
+!25 = !{!"omnipotent char", !26, i64 0}
+!26 = !{!"Simple C/C++ TBAA"}
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll
@@ -0,0 +1,111 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+; Test that precise native vector allocas are marked with a vector overload call
+; to dx.attribute.precise() and not scalar extracted and re-inserted
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%ConstantBuffer = type opaque
+
+@"$Globals" = external constant %ConstantBuffer
+
+; Function Attrs: nounwind
+define <4 x float> @main(<4 x float> %pos, <4 x float> %scale, <4 x float> %shift) #0 {
+bb:
+  %tmp = alloca <4 x float>, align 4, !dx.temp !10
+  %tmp1 = alloca <4 x float>, align 4, !dx.temp !10
+  %tmp2 = alloca <4 x float>, align 4, !dx.temp !10
+  %position = alloca <4 x float>, align 4, !dx.precise !24
+  store <4 x float> %shift, <4 x float>* %tmp, align 4, !tbaa !25
+  store <4 x float> %scale, <4 x float>* %tmp1, align 4, !tbaa !25
+  store <4 x float> %pos, <4 x float>* %tmp2, align 4, !tbaa !25
+  %tmp3 = bitcast <4 x float>* %position to i8* ; line:7 col:3
+  call void @llvm.lifetime.start(i64 16, i8* %tmp3) #0 ; line:7 col:3
+  %tmp4 = load <4 x float>, <4 x float>* %tmp2, align 4, !tbaa !25 ; line:7 col:29
+  ; CHECK: %tmp4 = load <4 x float>, <4 x float>* %tmp2
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4)
+
+  store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !25 ; line:7 col:18
+  %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:14
+  %tmp6 = load <4 x float>, <4 x float>* %tmp1, align 4, !tbaa !25 ; line:11 col:25
+  %tmp7 = fmul <4 x float> %tmp5, %tmp6 ; line:11 col:23
+
+  ; CHECK: %tmp7 = fmul <4 x float> %tmp5, %tmp6
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp7)
+
+  store <4 x float> %tmp7, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:12
+  %tmp8 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14
+  %tmp9 = extractelement <4 x float> %tmp8, i32 2 ; line:17 col:14
+  %tmp10 = fadd float %tmp9, 0x3F847AE140000000 ; line:17 col:14
+  %tmp11 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14
+  %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2 ; line:17 col:14
+
+  ; CHECK: %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @dx.attribute.precise.float(float %tmp10)
+
+  store float %tmp10, float* %tmp12 ; line:17 col:14
+  %tmp13 = load <4 x float>, <4 x float>* %tmp, align 4, !tbaa !25 ; line:20 col:15
+  %tmp14 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12
+  %tmp15 = fadd <4 x float> %tmp14, %tmp13 ; line:20 col:12
+
+  ; CHECK: %tmp15 = fadd <4 x float> %tmp14, %tmp13
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp15)
+
+  store <4 x float> %tmp15, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12
+  %tmp16 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:22 col:10
+  %tmp17 = bitcast <4 x float>* %position to i8* ; line:23 col:1
+  call void @llvm.lifetime.end(i64 16, i8* %tmp17) #0 ; line:23 col:1
+  ret <4 x float> %tmp16 ; line:22 col:3
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+attributes #0 = { nounwind }
+
+!pauseresume = !{!1}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!17}
+!dx.fnprops = !{!21}
+!dx.options = !{!22, !23}
+
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!3 = !{i32 1, i32 9}
+!4 = !{i32 1, i32 10}
+!5 = !{!"vs", i32 6, i32 9}
+!6 = !{i32 1, <4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !7}
+!7 = !{!8, !11, !13, !15}
+!8 = !{i32 1, !9, !10}
+!9 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9, i32 13, i32 4}
+!10 = !{}
+!11 = !{i32 0, !12, !10}
+!12 = !{i32 4, !"POSITION", i32 7, i32 9, i32 13, i32 4}
+!13 = !{i32 0, !14, !10}
+!14 = !{i32 4, !"SCL", i32 7, i32 9, i32 13, i32 4}
+!15 = !{i32 0, !16, !10}
+!16 = !{i32 4, !"OFF", i32 7, i32 9, i32 13, i32 4}
+!17 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, !18, null}
+!18 = !{null, null, !19, null}
+!19 = !{!20}
+!20 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
+!21 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, i32 1}
+!22 = !{i32 64}
+!23 = !{i32 -1}
+!24 = !{i32 1}
+!25 = !{!26, !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C/C++ TBAA"}