Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 117 additions & 22 deletions mlir/utils/pygments/mlir_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,132 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from pygments.lexer import RegexLexer
from pygments.lexer import RegexLexer, bygroups, include, using
from pygments.token import *
import re


class MlirLexer(RegexLexer):
"""Pygments lexer for MLIR.
This lexer focuses on accurate tokenization of common MLIR constructs:
- SSA values (%%... / %...)
- attribute and type aliases (#name =, !name =)
- types (builtin and dialect types, parametric types)
- attribute dictionaries and nested containers to a reasonable depth
- numbers (ints, floats with exponents, hex)
- strings with common escapes
- line comments (// ...)
- block labels (^foo) and operations
"""

name = "MLIR"
aliases = ["mlir"]
filenames = ["*.mlir"]

flags = re.MULTILINE

class VariableList(RegexLexer):
"""Lexer for lists of SSA variables separated by commas."""

tokens = {
"root": [
(r"\s+", Text),
(r",", Punctuation),
(r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
]
}

tokens = {
"root": [
(r"%[a-zA-Z0-9_]+", Name.Variable),
(r"@[a-zA-Z_][a-zA-Z0-9_]+", Name.Function),
(r"\^[a-zA-Z0-9_]+", Name.Label),
(r"#[a-zA-Z0-9_]+", Name.Constant),
(r"![a-zA-Z0-9_]+", Keyword.Type),
(r"[a-zA-Z_][a-zA-Z0-9_]*\.", Name.Entity),
(r"memref[^.]", Keyword.Type),
(r"index", Keyword.Type),
(r"i[0-9]+", Keyword.Type),
(r"f[0-9]+", Keyword.Type),
# Comments
(r"//.*?$", Comment.Single),
# operation name with assignment: %... = op.name
(
r"^(\s*)(%[\%_A-Za-z0-9\:#\,\s]+)(=)(\s*)([A-Za-z0-9_\.\$\-]+)\b",
bygroups(Text, using(VariableList), Operator, Text, Name.Builtin),
),
# operation name without result
(r"^(\s*)([A-Za-z0-9_\.\$\-]+)\b(?=[^<:])", bygroups(Text, Name.Builtin)),
# Attribute alias definition: #name =
(
r"^(\s*)(#[_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
bygroups(Text, Name.Constant, Text, Operator),
),
# Type alias definition: !name =
(
r"^(\s*)(![_A-Za-z0-9\$\-\.]+)(\b)(\s*=)",
bygroups(Text, Keyword.Type, Text, Operator),
),
# SSA values (uses)
(r"%[_A-Za-z0-9\.\$\-:#]+", Name.Variable),
# attribute refs, constants and named attributes
(r"#[_A-Za-z0-9\$\-\.]+\b", Name.Constant),
# symbol refs / function-like names
(r"@[_A-Za-z][_A-Za-z0-9\$\-\.]*\b", Name.Function),
# blocks
(r"\^[A-Za-z0-9_\$\.\-]+", Name.Label),
# types by exclamation or builtin names
(r"![_A-Za-z0-9\$\-\.]+\b", Keyword.Type),
# NOTE: please sync changes to corresponding builtin type rule in "angled-type"
(r"\b(bf16|f16|f32|f64|f80|f128|index|none|(u|s)?i[0-9]+)\b", Keyword.Type),
# container-like dialect types (tensor<...>, memref<...>, vector<...>)
(
r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
bygroups(Keyword.Type, Punctuation),
"angled-type",
),
# affine constructs
(r"\b(affine_map|affine_set)\b", Keyword.Reserved),
# common builtin operators / functions inside affine_map
(r"\b(ceildiv|floordiv|mod|symbol)\b", Name.Other),
# identifiers / bare words
(r"\b[_A-Za-z][_A-Za-z0-9\.-]*\b", Name.Other),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know how exactly leading \b is handled, does it require whitespace or some special punctuation?

Copy link
Member Author

@PragmaTwice PragmaTwice Nov 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

\b stands for word boundary. e.g. r"abc" can match "xabcx", but r"\babc\b" cannot.

For example, if we add a rule (r"index", XXX), it can match index in something like create_index %a.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it is a word boundary. I don't know what precisely it means here. Will it still match in memref<index>? In transform.foo.index?

Copy link
Member Author

@PragmaTwice PragmaTwice Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • for memref<index>, yes it will match since index is a word separated by < and >.
  • for transform.foo.index, if we look at this rule alone, it will match, but the preceding rules might match transform.foo.index as an operation name in advance, so it may not take effect.

# numbers: hex, float (with exponent), integer
(r"\b0x[0-9A-Fa-f]+\b", Number.Hex),
(r"\b([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?\b", Number.Float),
(r"\b[0-9]+\b", Number.Integer),
# strings
(r'"', String.Double, "string"),
# punctuation and arrow-like tokens
(r"->|>=|<=|\>=|\<=|\->|\=>", Operator),
(r"[()\[\]{}<>,.:=]", Punctuation),
# operators
(r"[-+*/%]", Operator),
],
# string state with common escapes
"string": [
(r'\\[ntr"\\]', String.Escape),
(r'[^"\\]+', String.Double),
(r'"', String.Double, "#pop"),
],
# angled-type content
"angled-type": [
# match nested '<' and '>'
(r"<", Punctuation, "#push"),
(r">", Punctuation, "#pop"),
# dimensions like 3x or 3x3x... and standalone numbers:
# - match numbers that are followed by an 'x' (dimension separator)
(r"([0-9]+)(?=(?:x))", Number.Integer),
# - match bare numbers (sizes)
(r"[0-9]+", Number.Integer),
(r"[0-9]*\.[0-9]*", Number.Float),
(r'"[^"]*"', String.Double),
(r"affine_map", Keyword.Reserved),
# TODO: this should be within affine maps only
(r"\+-\*\/", Operator),
(r"floordiv", Operator.Word),
(r"ceildiv", Operator.Word),
(r"mod", Operator.Word),
(r"()\[\]<>,{}", Punctuation),
(r"\/\/.*\n", Comment.Single),
]
# dynamic dimension '?'
(r"\?", Name.Integer),
# the 'x' dimension separator (treat as punctuation)
(r"x", Punctuation),
# element / builtin types inside angle brackets (no word-boundary)
# NOTE: please sync changes to corresponding builtin type rule in "root"
(
r"(?:bf16|f16|f32|f64|f80|f128|index|none|(?:[us]?i[0-9]+))",
Keyword.Type,
),
# also allow nested container-like types to be recognized
(
r"\b(complex|memref|tensor|tuple|vector)\s*(<)",
bygroups(Keyword.Type, Punctuation),
"angled-type",
),
# fall back to root rules for anything else
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this is possible, we shouldn't need the special logic above, I think

Copy link
Member Author

@PragmaTwice PragmaTwice Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup but the parsing logic inside and outside the angle is quite different. One of the reason is stated here: #166406 (comment).

include("root"),
],
}