From 412243815fc3937f49a377884f9a77fbff4801bc Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 15 Sep 2025 21:50:40 -0700 Subject: [PATCH 1/2] MC: Better handle backslash-escaped symbols The MCContext::getOrCreateSymbol change in #138817 was a workaround. With #158106, we can replace `getOrCreateSymbol` with `parseSymbol`, in llvm/lib/MC/MCParser to handle backslash-escaped symbols. --- llvm/include/llvm/MC/MCContext.h | 4 ++ llvm/lib/MC/MCContext.cpp | 49 +++++++++++-------- llvm/lib/MC/MCParser/AsmParser.cpp | 12 ++--- llvm/lib/MC/MCParser/COFFMasmParser.cpp | 4 +- llvm/lib/MC/MCParser/ELFAsmParser.cpp | 2 +- llvm/lib/MC/MCParser/MCAsmParser.cpp | 2 +- llvm/lib/MC/MCParser/MCAsmParserExtension.cpp | 4 +- llvm/lib/MC/MCParser/MasmParser.cpp | 13 +++-- llvm/lib/MC/MCParser/WasmAsmParser.cpp | 3 +- llvm/test/MC/ELF/cgprofile.s | 10 ++-- llvm/test/MC/ELF/symbol-names.s | 3 ++ 11 files changed, 59 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index d96609c0fec21..4a528eecfc900 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -484,6 +484,10 @@ class MCContext { /// \param Name - The symbol name, which must be unique across all symbols. LLVM_ABI MCSymbol *getOrCreateSymbol(const Twine &Name); + /// Variant of getOrCreateSymbol that handles backslash-escaped symbols. + /// For example, parse "a\"b\\" as a"\. + LLVM_ABI MCSymbol *parseSymbol(const Twine &Name); + /// Gets a symbol that will be defined to the final stack offset of a local /// variable after codegen. /// diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 5d9ddc2f1306c..5df8692d2e7ba 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -203,27 +203,6 @@ MCInst *MCContext::createMCInst() { MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) { SmallString<128> NameSV; StringRef NameRef = Name.toStringRef(NameSV); - if (NameRef.contains('\\')) { - NameSV = NameRef; - size_t S = 0; - // Support escaped \\ and \" as in GNU Assembler. GAS issues a warning for - // other characters following \\, which we do not implement due to code - // structure. - for (size_t I = 0, E = NameSV.size(); I != E; ++I) { - char C = NameSV[I]; - if (C == '\\' && I + 1 != E) { - switch (NameSV[I + 1]) { - case '"': - case '\\': - C = NameSV[++I]; - break; - } - } - NameSV[S++] = C; - } - NameSV.resize(S); - NameRef = NameSV; - } assert(!NameRef.empty() && "Normal symbols cannot be unnamed!"); @@ -244,6 +223,34 @@ MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) { return Entry.second.Symbol; } +MCSymbol *MCContext::parseSymbol(const Twine &Name) { + SmallString<128> SV; + StringRef NameRef = Name.toStringRef(SV); + if (NameRef.contains('\\')) { + SV = NameRef; + size_t S = 0; + // Support escaped \\ and \" as in GNU Assembler. GAS issues a warning for + // other characters following \\, which we do not implement due to code + // structure. + for (size_t I = 0, E = SV.size(); I != E; ++I) { + char C = SV[I]; + if (C == '\\' && I + 1 != E) { + switch (SV[I + 1]) { + case '"': + case '\\': + C = SV[++I]; + break; + } + } + SV[S++] = C; + } + SV.resize(S); + NameRef = SV; + } + + return getOrCreateSymbol(NameRef); +} + MCSymbol *MCContext::getOrCreateFrameAllocSymbol(const Twine &FuncName, unsigned Idx) { return getOrCreateSymbol(MAI->getPrivateGlobalPrefix() + FuncName + diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 5fa1539790c73..acea3ab23680a 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1213,8 +1213,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName); if (!Sym) - Sym = getContext().getOrCreateSymbol(MAI.isHLASM() ? SymbolName.upper() - : SymbolName); + Sym = getContext().parseSymbol(MAI.isHLASM() ? SymbolName.upper() + : SymbolName); // If this is an absolute variable reference, substitute it now to preserve // semantics in the face of reassignment. @@ -1845,7 +1845,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, RewrittenLabel); IDVal = RewrittenLabel; } - Sym = getContext().getOrCreateSymbol(IDVal); + Sym = getContext().parseSymbol(IDVal); } else Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal); // End of Labels should be treated as end of line for lexing @@ -4885,7 +4885,7 @@ bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) { if (discardLTOSymbol(Name)) return false; - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); // Assembler local symbols don't make any sense here, except for directives // that the symbol should be tagged. @@ -6142,7 +6142,7 @@ bool HLASMAsmParser::parseAsHLASMLabel(ParseStatementInfo &Info, return Error(LabelLoc, "Cannot have just a label for an HLASM inline asm statement"); - MCSymbol *Sym = getContext().getOrCreateSymbol( + MCSymbol *Sym = getContext().parseSymbol( getContext().getAsmInfo()->isHLASM() ? LabelVal.upper() : LabelVal); // Emit the label. @@ -6270,7 +6270,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef, Parser.getStreamer().emitValueToOffset(Value, 0, EqualLoc); return false; } else - Sym = Parser.getContext().getOrCreateSymbol(Name); + Sym = Parser.getContext().parseSymbol(Name); Sym->setRedefinable(allow_redef); diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index ef2815b037f2f..04e12e56c4262 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -510,8 +510,8 @@ bool COFFMasmParser::parseDirectiveAlias(StringRef Directive, SMLoc Loc) { getParser().parseAngleBracketString(ActualName)) return Error(getTok().getLoc(), "expected "); - MCSymbol *Alias = getContext().getOrCreateSymbol(AliasName); - MCSymbol *Actual = getContext().getOrCreateSymbol(ActualName); + MCSymbol *Alias = getContext().parseSymbol(AliasName); + MCSymbol *Actual = getContext().parseSymbol(ActualName); getStreamer().emitWeakReference(Alias, Actual); diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 19da9f57a4a6f..6195355626fd5 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -163,7 +163,7 @@ bool ELFAsmParser::parseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { continue; } - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitSymbolAttribute(Sym, Attr); diff --git a/llvm/lib/MC/MCParser/MCAsmParser.cpp b/llvm/lib/MC/MCParser/MCAsmParser.cpp index 3721541c71e11..c1b7e57184de1 100644 --- a/llvm/lib/MC/MCParser/MCAsmParser.cpp +++ b/llvm/lib/MC/MCParser/MCAsmParser.cpp @@ -168,7 +168,7 @@ bool MCAsmParser::parseSymbol(MCSymbol *&Res) { if (parseIdentifier(Name)) return true; - Res = getContext().getOrCreateSymbol(Name); + Res = getContext().parseSymbol(Name); return false; } diff --git a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp index 7fa05088c9725..299d4b46a8a84 100644 --- a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp +++ b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp @@ -50,8 +50,8 @@ bool MCAsmParserExtension::parseDirectiveCGProfile(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in directive"); - MCSymbol *FromSym = getContext().getOrCreateSymbol(From); - MCSymbol *ToSym = getContext().getOrCreateSymbol(To); + MCSymbol *FromSym = getContext().parseSymbol(From); + MCSymbol *ToSym = getContext().parseSymbol(To); getStreamer().emitCGProfileEntry( MCSymbolRefExpr::create(FromSym, getContext(), FromLoc), diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index b38c2f7e41634..7f0ea7830b495 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1480,7 +1480,7 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, auto VarIt = Variables.find(SymbolName.lower()); if (VarIt != Variables.end()) SymbolName = VarIt->second.Name; - Sym = getContext().getOrCreateSymbol(SymbolName); + Sym = getContext().parseSymbol(SymbolName); } // If this is an absolute variable reference, substitute it now to preserve @@ -1965,7 +1965,7 @@ bool MasmParser::parseStatement(ParseStatementInfo &Info, if (IDVal == "@@") { Sym = Ctx.createDirectionalLocalSymbol(0); } else { - Sym = getContext().getOrCreateSymbol(IDVal); + Sym = getContext().parseSymbol(IDVal); } // End of Labels should be treated as end of line for lexing @@ -3009,8 +3009,7 @@ bool MasmParser::parseDirectiveEquate(StringRef IDVal, StringRef Name, return false; } - auto *Sym = - static_cast(getContext().getOrCreateSymbol(Var.Name)); + auto *Sym = static_cast(getContext().parseSymbol(Var.Name)); const MCConstantExpr *PrevValue = Sym->isVariable() ? dyn_cast_or_null(Sym->getVariableValue()) @@ -3318,7 +3317,7 @@ bool MasmParser::parseDirectiveNamedValue(StringRef TypeName, unsigned Size, StringRef Name, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitIntegralValues(Size, &Count)) @@ -3509,7 +3508,7 @@ bool MasmParser::parseDirectiveNamedRealValue(StringRef TypeName, SMLoc NameLoc) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitRealValues(Semantics, &Count)) @@ -4003,7 +4002,7 @@ bool MasmParser::parseDirectiveNamedStructValue(const StructInfo &Structure, SMLoc DirLoc, StringRef Name) { if (StructInProgress.empty()) { // Initialize named data value. - MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + MCSymbol *Sym = getContext().parseSymbol(Name); getStreamer().emitLabel(Sym); unsigned Count; if (emitStructValues(Structure, &Count)) diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 1befcacb3952f..75e8948f8ce9e 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -240,8 +240,7 @@ class WasmAsmParser : public MCAsmParserExtension { return error("Expected label after .type directive, got: ", Lexer->getTok()); auto *WasmSym = static_cast( - getStreamer().getContext().getOrCreateSymbol( - Lexer->getTok().getString())); + getStreamer().getContext().parseSymbol(Lexer->getTok().getString())); Lex(); if (!(isNext(AsmToken::Comma) && isNext(AsmToken::At) && Lexer->is(AsmToken::Identifier))) diff --git a/llvm/test/MC/ELF/cgprofile.s b/llvm/test/MC/ELF/cgprofile.s index f8469ddc68877..28d8b72185556 100644 --- a/llvm/test/MC/ELF/cgprofile.s +++ b/llvm/test/MC/ELF/cgprofile.s @@ -5,11 +5,11 @@ a: .word b .cg_profile a, b, 32 .cg_profile freq, a, 11 - .cg_profile late, late2, 20 + .cg_profile "late\\", late2, 20 .cg_profile .L.local, b, 42 - .globl late -late: + .globl "late\\" +"late\\": late2: .word 0 late3: .L.local: @@ -31,7 +31,7 @@ late3: # CHECK-NEXT: 0010: 14000000 00000000 2A000000 00000000 # CHECK-NEXT: ) -# CHECK: Name: .rel.llvm.call-graph-profile (28) +# CHECK: Name: .rel.llvm.call-graph-profile # CHECK-NEXT: Type: SHT_REL (0x9) # CHECK-NEXT: Flags [ (0x40) # CHECK-NEXT: SHF_INFO_LINK @@ -83,7 +83,7 @@ late3: # CHECK-NEXT: Type: # CHECK-NEXT: Other: # CHECK-NEXT: Section: Undefined -# CHECK: Name: late +# CHECK: Name: late\ ([[#]]) # CHECK-NEXT: Value: # CHECK-NEXT: Size: # CHECK-NEXT: Binding: Global diff --git a/llvm/test/MC/ELF/symbol-names.s b/llvm/test/MC/ELF/symbol-names.s index 427187c329acf..f1593dd2f8099 100644 --- a/llvm/test/MC/ELF/symbol-names.s +++ b/llvm/test/MC/ELF/symbol-names.s @@ -5,6 +5,7 @@ // CHECK-LABEL:SYMBOL TABLE: // CHECK-NEXT: 0000000000000001 l F .text 0000000000000000 a"b\{{$}} // CHECK-NEXT: 0000000000000006 l .text 0000000000000000 a\{{$}} +// CHECK-NEXT: 000000000000000b l .text 0000000000000000 a\\{{$}} // CHECK-NEXT: 0000000000000000 g F .text 0000000000000000 foo?bar // CHECK-NEXT: 0000000000000000 *UND* 0000000000000000 a"b\q{{$}} // CHECK-EMPTY: @@ -26,3 +27,5 @@ ret "a\\": /// GAS emits a warning for \q call "a\"b\q" + +"a\\\\" = . From fc416913f5fc822a43ec175a645a60cdd96c4a4e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 16 Sep 2025 09:05:06 -0700 Subject: [PATCH 2/2] Add CodeGen test --- llvm/test/CodeGen/X86/symbol-name.ll | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 llvm/test/CodeGen/X86/symbol-name.ll diff --git a/llvm/test/CodeGen/X86/symbol-name.ll b/llvm/test/CodeGen/X86/symbol-name.ll new file mode 100644 index 0000000000000..dd9be14fb053e --- /dev/null +++ b/llvm/test/CodeGen/X86/symbol-name.ll @@ -0,0 +1,5 @@ +; RUN: llc < %s -mtriple=x86_64 -relocation-model=pic | FileCheck %s + +; CHECK: .globl "\\\"" +; CHECK-NEXT: "\\\"": +@"\\\22" = constant i8 0