2 changes: 1 addition & 1 deletion clang/test/CodeGenCXX/microsoft-abi-eh-ip2state.cpp
@@ -40,7 +40,7 @@ void case_calls_dll_import() NO_TAIL {
// CHECK: .seh_endprologue
// CHECK: .Limpcall{{[0-9]+}}:
// CHECK-NEXT: rex64
// CHECK-NEXT: call __imp_some_dll_import
// CHECK-NEXT: call qword ptr [rip + __imp_some_dll_import]
// CHECK-NEXT: nop dword ptr {{\[.*\]}}
// CHECK-NEXT: nop
// CHECK-NEXT: .seh_startepilogue
19 changes: 19 additions & 0 deletions llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -63,6 +63,8 @@ class X86ExpandPseudo : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI);
void expandCALL_RVMARKER(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
void expandCALL_ImpCall(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI);
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool expandMBB(MachineBasicBlock &MBB);

@@ -254,6 +256,20 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB,
std::next(RtCall->getIterator()));
}

void X86ExpandPseudo::expandCALL_ImpCall(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) {
// Expand CALL64_ImpCall pseudo to CALL64m.
MachineInstr &MI = *MBBI;
  BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64m))
      .addReg(X86::RIP) // Base register: RIP-relative addressing.
      .addImm(1)        // Scale.
      .addReg(0)        // No index register.
      .addGlobalAddress(MI.getOperand(0).getGlobal(), 0,
                        MI.getOperand(0).getTargetFlags()) // Displacement.
      .addReg(0);       // No segment register.
MI.eraseFromParent();
}
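For orientation, a minimal sketch of the call shape this expansion serves (hypothetical names; the expected assembly mirrors the updated clang test above):

declare dllimport void @some_import()

define void @calls_import() {
entry:
  call void @some_import()
  ret void
}
; With the "import-call-optimization" module flag set, the CALL64_ImpCall
; pseudo expands to a RIP-relative memory call through the function's
; import address table entry:
;   rex64
;   callq *__imp_some_import(%rip)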

/// If \p MBBI is a pseudo instruction, this method expands
/// it to the corresponding (sequence of) actual instruction(s).
/// \returns true if \p MBBI has been expanded.
@@ -886,6 +902,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::CALL64r_ImpCall:
MI.setDesc(TII->get(X86::CALL64r));
return true;
case X86::CALL64_ImpCall:
expandCALL_ImpCall(MBB, MBBI);
return true;
case X86::ADD32mi_ND:
case X86::ADD64mi32_ND:
case X86::SUB32mi_ND:
16 changes: 11 additions & 5 deletions llvm/lib/Target/X86/X86FastISel.cpp
@@ -3317,11 +3317,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Flag.isSwiftError() || Flag.isPreallocated())
return false;

// Can't handle import call optimization.
if (Is64Bit &&
MF->getFunction().getParent()->getModuleFlag("import-call-optimization"))
return false;

SmallVector<MVT, 16> OutVTs;
SmallVector<Type *, 16> ArgTys;
SmallVector<Register, 16> ArgRegs;
@@ -3563,6 +3558,17 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (CalleeOp) {
// Register-indirect call.
unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;

const Module *M = FuncInfo.MF->getFunction().getParent();
if (CalleeOp != X86::RAX && Is64Bit &&
M->getModuleFlag("import-call-optimization")) {
// Import call optimization requires all indirect calls to be via RAX.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
TII.get(TargetOpcode::COPY), X86::RAX)
.addReg(CalleeOp);
CalleeOp = X86::RAX;
}

MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(CallOpc))
.addReg(CalleeOp);
} else {
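A hedged sketch of the FastISel case the copy above handles (hypothetical names): an indirect call whose callee was materialized in some other register is funneled through RAX, since the import call section must record a fixed register for indirect call sites:

define void @indirect(ptr %fp) {
entry:
  call void %fp()
  ret void
}
; Expected lowering under import-call-optimization (the callee pointer
; arrives in %rcx per the Windows x64 calling convention):
;   movq %rcx, %rax
;   callq *%rax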
13 changes: 5 additions & 8 deletions llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1313,9 +1313,6 @@ def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 texternalsym:$dst)),
def : Pat<(X86call_rvmarker (i64 tglobaladdr:$rvfunc), (i64 tglobaladdr:$dst)),
(CALL64pcrel32_RVMARKER tglobaladdr:$rvfunc, tglobaladdr:$dst)>;

def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
(CALL64pcrel32 tglobaladdr:$dst)>;

// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
@@ -1350,25 +1347,25 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),

def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;

def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
(TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;

def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabled]>;
Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationEnabledAndCFGuardDisabled]>;

// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
(TCRETURNmi64 addr:$dst, timm:$off)>,
Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls]>;
Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;

def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
(TCRETURN_WINmi64 addr:$dst, timm:$off)>,
Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls]>;
Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabledOrCFGuardEnabled]>;

def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, timm:$off)>,
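For context, a hedged sketch of the tail-call case the predicate split above governs (hypothetical names): with import-call-optimization set and no cfguard flag, only the TCRETURNri64_ImpCall pattern can match, which forces the target into RAX:

define void @tail(ptr %fp) {
entry:
  tail call void %fp()
  ret void
}
; Expected lowering with import-call-optimization enabled and cfguard absent:
;   movq %rcx, %rax
;   rex64 jmpq *%rax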
10 changes: 7 additions & 3 deletions llvm/lib/Target/X86/X86InstrControl.td
@@ -331,11 +331,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabled]>;
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationDisabledOrCFGuardEnabled]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
NotUseIndirectThunkCalls]>;
NotUseIndirectThunkCalls,ImportCallOptimizationDisabledOrCFGuardEnabled]>;

// Non-tracking calls for IBT, use with caution.
let isCodeGenOnly = 1 in {
@@ -433,9 +433,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
PseudoI<(outs), (ins i64imm:$rvfunc, i64i32imm_brtarget:$dst), []>,
Requires<[In64BitMode]>;

def CALL64_ImpCall :
PseudoI<(outs), (ins i64imm:$dst), [(X86imp_call tglobaladdr:$dst)]>,
Requires<[In64BitMode]>;

def CALL64r_ImpCall :
PseudoI<(outs), (ins GR64_A:$dst), [(X86call GR64_A:$dst)]>,
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabled]>;
Requires<[In64BitMode,NotUseIndirectThunkCalls,ImportCallOptimizationEnabledAndCFGuardDisabled]>;
}

// Conditional tail calls are similar to the above, but they are branches
6 changes: 4 additions & 2 deletions llvm/lib/Target/X86/X86InstrPredicates.td
@@ -234,8 +234,10 @@ let RecomputePerFunction = 1 in {
"shouldOptForSize(MF)">;
def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def ImportCallOptimizationEnabledAndCFGuardDisabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\") && "
                                                                "!MF->getFunction().getParent()->getModuleFlag(\"cfguard\")">;
def ImportCallOptimizationDisabledOrCFGuardEnabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\") || "
                                                               "MF->getFunction().getParent()->getModuleFlag(\"cfguard\")">;

def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
47 changes: 27 additions & 20 deletions llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -2346,7 +2346,8 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {

case X86::TAILJMPr64_REX: {
if (EnableImportCallOptimization) {
assert(MI->getOperand(0).getReg() == X86::RAX &&
assert((MI->getOperand(0).getReg() == X86::RAX ||
MF->getFunction().getParent()->getModuleFlag("cfguard")) &&
"Indirect tail calls with impcall enabled must go through RAX (as "
"enforced by TCRETURNImpCallri64)");
emitLabelAndRecordForImportCallOptimization(
@@ -2547,28 +2548,18 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (IndCSPrefix && MI->hasRegisterImplicitUseOperand(X86::R11))
EmitAndCountInstruction(MCInstBuilder(X86::CS_PREFIX));

if (EnableImportCallOptimization && isImportedFunction(MI->getOperand(0))) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_IMPORT_CALL);

MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);

// For Import Call Optimization to work, we need the call instruction
// with a rex prefix, and a 5-byte nop after the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
assert((!EnableImportCallOptimization ||
        !isImportedFunction(MI->getOperand(0))) &&
       "Calls to imported functions with import call optimization "
       "should be lowered to CALL64m via CALL64_ImpCall");

break;

case X86::CALL64r:
if (EnableImportCallOptimization) {
assert(MI->getOperand(0).getReg() == X86::RAX &&
"Indirect calls with impcall enabled must go through RAX (as "
"Indirect calls with import call optimization enabled must go "
"through RAX (as "
"enforced by CALL64r_ImpCall)");

emitLabelAndRecordForImportCallOptimization(
@@ -2586,9 +2577,25 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
break;

case X86::CALL64m:
if (EnableImportCallOptimization && isCallToCFGuardFunction(MI)) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_CFG_CALL);
if (EnableImportCallOptimization) {
if (isCallToCFGuardFunction(MI)) {
emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_CFG_CALL);
} else if (isImportedFunction(MI->getOperand(3))) {
Review comment — efriedma-quic (Collaborator), Sep 24, 2025:
This is referring to a CALL64m where the symbol is an __imp_ symbol?
What happens if there's an indirect call to a function which isn't an imported function? Or is that impossible because we don't have CALL64m_ImpCall?

Reply — PR author (Contributor):
Despite the name, Import Call Optimization also wants to know all the places where there are CF Guard calls, whether they are for imported functions or not.

emitLabelAndRecordForImportCallOptimization(
IMAGE_RETPOLINE_AMD64_IMPORT_CALL);

MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);

// For Import Call Optimization to work, we need the call instruction
// with a rex prefix, and a 5-byte nop after the call instruction.
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
emitCallInstruction(TmpInst);
emitNop(*OutStreamer, 5, Subtarget);
maybeEmitNopAfterCallForWindowsEH(MI);
return;
}
}
break;

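Per the review reply above, a short sketch of the non-imported CF Guard case that still gets recorded (hypothetical names; the expected lines mirror the cfguard test below):

define void @guarded(ptr %fp) {
entry:
  call void %fp()
  ret void
}
; With both module flags set, the indirect call is routed through the CF
; Guard dispatch thunk and still receives an import call section label:
;   movq %rcx, %rax
; .Limpcall0:
;   callq *__guard_dispatch_icall_fptr(%rip)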
75 changes: 66 additions & 9 deletions llvm/test/CodeGen/X86/win-import-call-optimization-cfguard.ll
Original file line number Diff line number Diff line change
@@ -1,33 +1,90 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s

; FIXME: FastISel emits calls to the CF Guard dispatch function as indirect
; calls via registers. Normally this would work, but it is the wrong pattern
; for Import Call Optimization.

@global_func_ptr = external dso_local local_unnamed_addr global ptr, align 8
declare dllimport void @a() local_unnamed_addr
declare dllimport void @b() local_unnamed_addr

define dso_local void @normal_call(ptr noundef readonly %func_ptr) local_unnamed_addr section "nc_sect" {
entry:
call void @a()
call void @a()
call void %func_ptr()
%0 = load ptr, ptr @global_func_ptr, align 8
call void %0()
ret void
}
; CHECK-LABEL: normal_call:
; CHECK: .Limpcall0:
; CHECK: movq %rcx, %rsi
; CHECK-NEXT: .Limpcall0:
; CHECK-NEXT: rex64
; CHECK-NEXT: callq *__imp_a(%rip)
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: .Limpcall1:
; CHECK-NEXT: rex64
; CHECK-NEXT: callq *__imp_a(%rip)
; CHECK-NEXT: nopl 8(%rax,%rax)
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: .Limpcall2:
; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
; CHECK-NEXT: movq global_func_ptr(%rip), %rax
; CHECK-NEXT: .Limpcall3:
; CHECK-NEXT: callq *__guard_dispatch_icall_fptr(%rip)
; CHECK-NEXT: nop

define dso_local void @tail_call() local_unnamed_addr section "tc_sect" {
entry:
tail call void @b()
ret void
}
; CHECK-LABEL: tail_call:
; CHECK: .Limpcall4:
; CHECK-NEXT: jmp __imp_b

define dso_local void @tail_call_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
tail call void %func_ptr()
ret void
}
; CHECK-LABEL: tail_call_fp:
; CHECK: .Limpcall1:
; CHECK: movq %rcx, %rax
; CHECK-NEXT: .Limpcall5:
; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)

define dso_local void @tail_call_global_fp(ptr noundef readonly %func_ptr) local_unnamed_addr section "tc_sect" {
entry:
%0 = load ptr, ptr @global_func_ptr, align 8
tail call void %0()
ret void
}
; CHECK-LABEL: tail_call_global_fp:
; CHECK: movq global_func_ptr(%rip), %rax
; CHECK-NEXT: .Limpcall6:
; CHECK-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip)

; CHECK-LABEL: .section .retplne,"yi"
; CHECK-NEXT: .asciz "RetpolineV1"
; CHECK-NEXT: .long 16
; CHECK-NEXT: .secnum tc_sect
; CHECK-NEXT: .long 10
; CHECK-NEXT: .secoffset .Limpcall1
; CHECK-NEXT: .long 16
; CHECK-NEXT: .long 40
; CHECK-NEXT: .secnum nc_sect
; CHECK-NEXT: .long 9
; CHECK-NEXT: .long 3
; CHECK-NEXT: .secoffset .Limpcall0
; CHECK-NEXT: .long 3
; CHECK-NEXT: .secoffset .Limpcall1
; CHECK-NEXT: .long 9
; CHECK-NEXT: .secoffset .Limpcall2
; CHECK-NEXT: .long 9
; CHECK-NEXT: .secoffset .Limpcall3
; CHECK-NEXT: .long 32
; CHECK-NEXT: .secnum tc_sect
; CHECK-NEXT: .long 2
; CHECK-NEXT: .secoffset .Limpcall4
; CHECK-NEXT: .long 4
; CHECK-NEXT: .secoffset .Limpcall5
; CHECK-NEXT: .long 4
; CHECK-NEXT: .secoffset .Limpcall6

!llvm.module.flags = !{!0, !1}
!0 = !{i32 1, !"import-call-optimization", i32 1}
1 change: 1 addition & 0 deletions llvm/test/CodeGen/X86/win-import-call-optimization.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
; RUN: llc --fast-isel -mtriple=x86_64-pc-windows-msvc -o - %s | FileCheck %s

; CHECK-LABEL: uses_rax:
; CHECK: .Limpcall0: