diff --git a/mlir/utils/pygments/mlir_lexer.py b/mlir/utils/pygments/mlir_lexer.py
index 179a058e9110c..4cbe0fe236fc4 100644
--- a/mlir/utils/pygments/mlir_lexer.py
+++ b/mlir/utils/pygments/mlir_lexer.py
@@ -2,37 +2,146 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from pygments.lexer import RegexLexer
+from pygments.lexer import RegexLexer, bygroups, include, using
 from pygments.token import *
+import re
 
 
 class MlirLexer(RegexLexer):
+    """Pygments lexer for MLIR.
+
+    This lexer focuses on accurate tokenization of common MLIR constructs:
+    - SSA values (%name, %0:2, %0#1)
+    - attribute and type aliases (#name =, !name =)
+    - types (builtin and dialect types, parametric types)
+    - attribute dictionaries and nested containers to a reasonable depth
+    - numbers (ints, floats with exponents, hex)
+    - strings with common escapes
+    - line comments (// ...)
+    - block labels (^foo) and operations
+    """
+
     name = "MLIR"
     aliases = ["mlir"]
     filenames = ["*.mlir"]
 
+    # '^' in the rules below must match at every line start, not only at the
+    # start of the input.
+    flags = re.MULTILINE
+
+    class VariableList(RegexLexer):
+        """Lexer for lists of SSA variables separated by commas."""
+
+        tokens = {
+            "root": [
+                (r"\s+", Text),
+                (r",", Punctuation),
+                (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
+            ]
+        }
+
     tokens = {
         "root": [
-            (r"%[a-zA-Z0-9_]+", Name.Variable),
-            (r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
-            (r"\^[a-zA-Z0-9_]+", Name.Label),
-            (r"#[a-zA-Z0-9_]+", Name.Constant),
-            (r"![a-zA-Z0-9_]+", Keyword.Type),
-            (r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
-            (r"memref[^.]", Keyword.Type),
-            (r"index", Keyword.Type),
-            (r"i[0-9]+", Keyword.Type),
-            (r"f[0-9]+", Keyword.Type),
+            # Comments
+            (r"//.*?$", Comment.Single),
+            # operation name with assignment: %... = op.name
+            # (the result-list character class mirrors the SSA value rule below,
+            # plus ',' and whitespace for multi-result lists)
+            (
+                r"^(\s*)(%[\%_A-Za-z0-9\.\$\-\:#\,\s]+)(=)(\s*)([A-Za-z0-9_\.\$\-]+)\b",
+                bygroups(Text, using(VariableList), Operator, Text, Name.Builtin),
+            ),
+            # operation name without result
+            (r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Builtin)),
+            # Attribute alias definition: #name =
+            (
+                r"^(\s*)(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+                bygroups(Text, Name.Constant, Text, Operator),
+            ),
+            # Type alias definition: !name =
+            (
+                r"^(\s*)(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
+                bygroups(Text, Keyword.Type, Text, Operator),
+            ),
+            # Horizontal whitespace. This must stay after the '^'-anchored rules
+            # and must not match '\n': RegexLexer consumes an unmatched newline
+            # on its own and resets to "root", so the next line's indentation is
+            # still available to the '^(\s*)' prefixes above.
+            (r"[^\S\n]+", Text),
+            # SSA values (uses)
+            (r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
+            # attribute refs, constants and named attributes
+            (r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
+            # symbol refs / function-like names
+            (r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
+            # blocks
+            (r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
+            # types by exclamation or builtin names
+            (r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
+            # NOTE: please sync changes to corresponding builtin type rule in "angled-type"
+            (r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
+            # container-like dialect types (tensor<...>, memref<...>, vector<...>)
+            (
+                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
+                bygroups(Keyword.Type, Punctuation),
+                "angled-type",
+            ),
+            # affine constructs
+            (r"\b(affine_map|affine_set)\b", Keyword.Reserved),
+            # common builtin operators / functions inside affine_map
+            (r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Other),
+            # identifiers / bare words
+            (r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
+            # numbers: hex, float, integer. The float rule requires a fraction or
+            # an exponent so that plain integers reach the Number.Integer rule.
+            (r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
+            (
+                r"\b[0-9]+(?:\.[0-9]*(?:[eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+)",
+                Number.Float,
+            ),
+            (r"\b[0-9]+\b", Number.Integer),
+            # strings
+            (r'"', String.Double, "string"),
+            # punctuation and arrow-like tokens
+            (r"->|=>|>=|<=", Operator),
+            (r"[()\[\]{}<>,.:=]", Punctuation),
+            # operators
+            (r"[-+*/%]", Operator),
+        ],
+        # string state with common escapes
+        "string": [
+            (r'\\[ntr"\\]', String.Escape),
+            # two-hex-digit byte escapes such as \00 or \0A
+            (r"\\[0-9A-Fa-f]{2}", String.Escape),
+            # any other backslash is kept as string text instead of an Error token
+            (r"\\", String.Double),
+            (r'[^"\\]+', String.Double),
+            (r'"', String.Double, "#pop"),
+        ],
+        # angled-type content
+        "angled-type": [
+            # match nested '<' and '>'
+            (r"<", Punctuation, "#push"),
+            (r">", Punctuation, "#pop"),
+            # dimension sizes such as the 3 in 3x4xf32, and other bare numbers
             (r"[0-9]+", Number.Integer),
-            (r"[0-9]*\.[0-9]*", Number.Float),
-            (r'"[^"]*"', String.Double),
-            (r"affine_map", Keyword.Reserved),
-            # TODO: this should be within affine maps only
-            (r"\+-\*\/", Operator),
-            (r"floordiv", Operator.Word),
-            (r"ceildiv", Operator.Word),
-            (r"mod", Operator.Word),
-            (r"()\[\]<>,{}", Punctuation),
-            (r"\/\/.*\n", Comment.Single),
-        ]
+            # dynamic dimension '?' (stands in for a dimension size)
+            (r"\?", Number.Integer),
+            # the 'x' dimension separator (treat as punctuation)
+            (r"x", Punctuation),
+            # element / builtin types inside angle brackets (no word-boundary)
+            # NOTE: please sync changes to corresponding builtin type rule in "root"
+            (
+                r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
+                Keyword.Type,
+            ),
+            # also allow nested container-like types to be recognized
+            (
+                r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
+                bygroups(Keyword.Type, Punctuation),
+                "angled-type",
+            ),
+            # fall back to root rules for anything else
+            include("root"),
+        ],
     }