Permalink
Cannot retrieve contributors at this time
Fetching contributors…
| <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" | |
| "http://www.w3.org/TR/html4/strict.dtd"> | |
| <html> | |
| <head> | |
| <title>Write your own lexer — Pygments</title> | |
| <meta http-equiv="content-type" content="text/html; charset=utf-8"> | |
| <style type="text/css"> | |
| body { | |
| background-color: #f2f2f2; | |
| margin: 0; | |
| padding: 0; | |
| font-family: 'Georgia', serif; | |
| color: #111; | |
| } | |
| #content { | |
| background-color: white; | |
| padding: 20px; | |
| margin: 20px auto 20px auto; | |
| max-width: 800px; | |
| border: 4px solid #ddd; | |
| } | |
| h1 { | |
| font-weight: normal; | |
| font-size: 40px; | |
| color: #09839A; | |
| } | |
| h2 { | |
| font-weight: normal; | |
| font-size: 30px; | |
| color: #C73F00; | |
| } | |
| h1.heading { | |
| margin: 0 0 30px 0; | |
| } | |
| h2.subheading { | |
| margin: -30px 0 0 45px; | |
| } | |
| h3 { | |
| margin-top: 30px; | |
| } | |
| table.docutils { | |
| border-collapse: collapse; | |
| border: 2px solid #aaa; | |
| margin: 0.5em 1.5em 0.5em 1.5em; | |
| } | |
| table.docutils td { | |
| padding: 2px; | |
| border: 1px solid #ddd; | |
| } | |
| p, li, dd, dt, blockquote { | |
| font-size: 15px; | |
| color: #333; | |
| } | |
| p { | |
| line-height: 150%; | |
| margin-bottom: 0; | |
| margin-top: 10px; | |
| } | |
| hr { | |
| border-top: 1px solid #ccc; | |
| border-bottom: 0; | |
| border-right: 0; | |
| border-left: 0; | |
| margin-bottom: 10px; | |
| margin-top: 20px; | |
| } | |
| dl { | |
| margin-left: 10px; | |
| } | |
| li, dt { | |
| margin-top: 5px; | |
| } | |
| dt { | |
| font-weight: bold; | |
| } | |
| th { | |
| text-align: left; | |
| } | |
| a { | |
| color: #990000; | |
| } | |
| a:hover { | |
| color: #c73f00; | |
| } | |
| pre { | |
| background-color: #f9f9f9; | |
| border-top: 1px solid #ccc; | |
| border-bottom: 1px solid #ccc; | |
| padding: 5px; | |
| font-size: 13px; | |
| font-family: Bitstream Vera Sans Mono,monospace; | |
| } | |
| tt { | |
| font-size: 13px; | |
| font-family: Bitstream Vera Sans Mono,monospace; | |
| color: black; | |
| padding: 1px 2px 1px 2px; | |
| background-color: #f0f0f0; | |
| } | |
| cite { | |
| /* abusing <cite>, it's generated by ReST for `x` */ | |
| font-size: 13px; | |
| font-family: Bitstream Vera Sans Mono,monospace; | |
| font-weight: bold; | |
| font-style: normal; | |
| } | |
| #backlink { | |
| float: right; | |
| font-size: 11px; | |
| color: #888; | |
| } | |
| div.toc { | |
| margin: 0 0 10px 0; | |
| } | |
| div.toc h2 { | |
| font-size: 20px; | |
| } | |
| .syntax .hll { background-color: #ffffcc } | |
| .syntax { background: #ffffff; } | |
| .syntax .c { color: #888888 } /* Comment */ | |
| .syntax .err { color: #a61717; background-color: #e3d2d2 } /* Error */ | |
| .syntax .k { color: #008800; font-weight: bold } /* Keyword */ | |
| .syntax .cm { color: #888888 } /* Comment.Multiline */ | |
| .syntax .cp { color: #cc0000; font-weight: bold } /* Comment.Preproc */ | |
| .syntax .c1 { color: #888888 } /* Comment.Single */ | |
| .syntax .cs { color: #cc0000; font-weight: bold; background-color: #fff0f0 } /* Comment.Special */ | |
| .syntax .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ | |
| .syntax .ge { font-style: italic } /* Generic.Emph */ | |
| .syntax .gr { color: #aa0000 } /* Generic.Error */ | |
| .syntax .gh { color: #333333 } /* Generic.Heading */ | |
| .syntax .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ | |
| .syntax .go { color: #888888 } /* Generic.Output */ | |
| .syntax .gp { color: #555555 } /* Generic.Prompt */ | |
| .syntax .gs { font-weight: bold } /* Generic.Strong */ | |
| .syntax .gu { color: #666666 } /* Generic.Subheading */ | |
| .syntax .gt { color: #aa0000 } /* Generic.Traceback */ | |
| .syntax .kc { color: #008800; font-weight: bold } /* Keyword.Constant */ | |
| .syntax .kd { color: #008800; font-weight: bold } /* Keyword.Declaration */ | |
| .syntax .kn { color: #008800; font-weight: bold } /* Keyword.Namespace */ | |
| .syntax .kp { color: #008800 } /* Keyword.Pseudo */ | |
| .syntax .kr { color: #008800; font-weight: bold } /* Keyword.Reserved */ | |
| .syntax .kt { color: #888888; font-weight: bold } /* Keyword.Type */ | |
| .syntax .m { color: #0000DD; font-weight: bold } /* Literal.Number */ | |
| .syntax .s { color: #dd2200; background-color: #fff0f0 } /* Literal.String */ | |
| .syntax .na { color: #336699 } /* Name.Attribute */ | |
| .syntax .nb { color: #003388 } /* Name.Builtin */ | |
| .syntax .nc { color: #bb0066; font-weight: bold } /* Name.Class */ | |
| .syntax .no { color: #003366; font-weight: bold } /* Name.Constant */ | |
| .syntax .nd { color: #555555 } /* Name.Decorator */ | |
| .syntax .ne { color: #bb0066; font-weight: bold } /* Name.Exception */ | |
| .syntax .nf { color: #0066bb; font-weight: bold } /* Name.Function */ | |
| .syntax .nl { color: #336699; font-style: italic } /* Name.Label */ | |
| .syntax .nn { color: #bb0066; font-weight: bold } /* Name.Namespace */ | |
| .syntax .py { color: #336699; font-weight: bold } /* Name.Property */ | |
| .syntax .nt { color: #bb0066; font-weight: bold } /* Name.Tag */ | |
| .syntax .nv { color: #336699 } /* Name.Variable */ | |
| .syntax .ow { color: #008800 } /* Operator.Word */ | |
| .syntax .w { color: #bbbbbb } /* Text.Whitespace */ | |
| .syntax .mf { color: #0000DD; font-weight: bold } /* Literal.Number.Float */ | |
| .syntax .mh { color: #0000DD; font-weight: bold } /* Literal.Number.Hex */ | |
| .syntax .mi { color: #0000DD; font-weight: bold } /* Literal.Number.Integer */ | |
| .syntax .mo { color: #0000DD; font-weight: bold } /* Literal.Number.Oct */ | |
| .syntax .sb { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Backtick */ | |
| .syntax .sc { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Char */ | |
| .syntax .sd { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Doc */ | |
| .syntax .s2 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Double */ | |
| .syntax .se { color: #0044dd; background-color: #fff0f0 } /* Literal.String.Escape */ | |
| .syntax .sh { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Heredoc */ | |
| .syntax .si { color: #3333bb; background-color: #fff0f0 } /* Literal.String.Interpol */ | |
| .syntax .sx { color: #22bb22; background-color: #f0fff0 } /* Literal.String.Other */ | |
| .syntax .sr { color: #008800; background-color: #fff0ff } /* Literal.String.Regex */ | |
| .syntax .s1 { color: #dd2200; background-color: #fff0f0 } /* Literal.String.Single */ | |
| .syntax .ss { color: #aa6600; background-color: #fff0f0 } /* Literal.String.Symbol */ | |
| .syntax .bp { color: #003388 } /* Name.Builtin.Pseudo */ | |
| .syntax .vc { color: #336699 } /* Name.Variable.Class */ | |
| .syntax .vg { color: #dd7700 } /* Name.Variable.Global */ | |
| .syntax .vi { color: #3333bb } /* Name.Variable.Instance */ | |
| .syntax .il { color: #0000DD; font-weight: bold } /* Literal.Number.Integer.Long */ | |
| </style> | |
| </head> | |
| <body> | |
| <div id="content"> | |
| <h1 class="heading">Pygments</h1> | |
| <h2 class="subheading">Write your own lexer</h2> | |
| <a id="backlink" href="index.html">« Back To Index</a> | |
| <div class="toc"> | |
| <h2>Contents</h2> | |
| <ul class="contents"> | |
| <li><a href="#regexlexer">RegexLexer</a></li> | |
| <li><a href="#regex-flags">Regex Flags</a></li> | |
| <li><a href="#scanning-multiple-tokens-at-once">Scanning multiple tokens at once</a></li> | |
| <li><a href="#changing-states">Changing states</a></li> | |
| <li><a href="#advanced-state-tricks">Advanced state tricks</a></li> | |
| <li><a href="#using-multiple-lexers">Using multiple lexers</a></li> | |
| <li><a href="#delegating-lexer">Delegating Lexer</a></li> | |
| <li><a href="#callbacks">Callbacks</a></li> | |
| <li><a href="#the-extendedregexlexer-class">The ExtendedRegexLexer class</a></li> | |
| <li><a href="#filtering-token-streams">Filtering Token Streams</a></li> | |
| </ul> | |
| </div> | |
| <!-- -*- mode: rst -*- --> | |
| <p>If a lexer for your favorite language is missing in the Pygments package, you can | |
| easily write your own and extend Pygments.</p> | |
| <p>All you need can be found inside the <cite>pygments.lexer</cite> module. As you can read in | |
| the <a class="reference external" href="./api.html">API documentation</a>, a lexer is a class that is initialized with | |
| some keyword arguments (the lexer options) and that provides a | |
| <cite>get_tokens_unprocessed()</cite> method which is given a string or unicode object with | |
| the data to parse.</p> | |
| <p>The <cite>get_tokens_unprocessed()</cite> method must return an iterator or iterable | |
| containing tuples in the form <tt class="docutils literal">(index, token, value)</tt>. Normally you don't need | |
| to do this since there are numerous base lexers you can subclass.</p> | |
| <div class="section" id="regexlexer"> | |
| <h3>RegexLexer</h3> | |
| <p>A very powerful (but quite easy to use) lexer is the <cite>RegexLexer</cite>. This lexer | |
| base class allows you to define lexing rules in terms of <em>regular expressions</em> | |
| for different <em>states</em>.</p> | |
| <p>States are groups of regular expressions that are matched against the input | |
| string at the <em>current position</em>. If one of these expressions matches, a | |
| corresponding action is performed (normally yielding a token with a specific | |
| type), the current position is set to where the last match ended and the | |
| matching process continues with the first regex of the current state.</p> | |
| <p>Lexer states are kept in a state stack: each time a new state is entered, the | |
| new state is pushed onto the stack. The most basic lexers (like the | |
| <cite>DiffLexer</cite>) just need one state.</p> | |
| <p>Each state is defined as a list of tuples in the form (<cite>regex</cite>, <cite>action</cite>, | |
| <cite>new_state</cite>) where the last item is optional. In the most basic form, <cite>action</cite> | |
| is a token type (like <cite>Name.Builtin</cite>). That means: When <cite>regex</cite> matches, emit a | |
| token with the match text and type <cite>tokentype</cite> and push <cite>new_state</cite> on the state | |
| stack. If the new state is <tt class="docutils literal">'#pop'</tt>, the topmost state is popped from the | |
| stack instead. (To pop more than one state, use <tt class="docutils literal">'#pop:2'</tt> and so on.) | |
| <tt class="docutils literal">'#push'</tt> is a synonym for pushing the current state on the | |
| stack.</p> | |
| <p>The following example shows the <cite>DiffLexer</cite> from the builtin lexers. Note that | |
| it contains some additional attributes <cite>name</cite>, <cite>aliases</cite> and <cite>filenames</cite> which | |
| aren't required for a lexer. They are used by the builtin lexer lookup | |
| functions.</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span> | |
| <span class="k">class</span> <span class="nc">DiffLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">name</span> <span class="o">=</span> <span class="s">'Diff'</span> | |
| <span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'diff'</span><span class="p">]</span> | |
| <span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.diff'</span><span class="p">]</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r' .*\n'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'\+.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Inserted</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'-.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Deleted</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'@.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Subheading</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'Index.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Heading</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'=.*\n'</span><span class="p">,</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Heading</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'.*\n'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>As you can see this lexer only uses one state. When the lexer starts scanning | |
| the text, it first checks if the current character is a space. If this is true | |
| it scans everything until newline and returns the parsed data as <cite>Text</cite> token.</p> | |
| <p>If this rule doesn't match, it checks if the current char is a plus sign. And | |
| so on.</p> | |
| <p>If no rule matches at the current position, the current char is emitted as an | |
| <cite>Error</cite> token that indicates a parsing error, and the position is increased by | |
| 1.</p> | |
| </div> | |
| <div class="section" id="regex-flags"> | |
| <h3>Regex Flags</h3> | |
| <p>You can either define regex flags in the regex (<tt class="docutils literal"><span class="pre">r'(?x)foo</span> bar'</tt>) or by adding | |
| a <cite>flags</cite> attribute to your lexer class. If no attribute is defined, it defaults | |
| to <cite>re.MULTILINE</cite>. For more information about regular expression flags see the | |
| <a class="reference external" href="http://docs.python.org/lib/re-syntax.html">regular expressions</a> help page in the python documentation.</p> | |
| </div> | |
| <div class="section" id="scanning-multiple-tokens-at-once"> | |
| <h3>Scanning multiple tokens at once</h3> | |
| <p>Here is a more complex lexer that highlights INI files. INI files consist of | |
| sections, comments and key = value pairs:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span> | |
| <span class="k">class</span> <span class="nc">IniLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">name</span> <span class="o">=</span> <span class="s">'INI'</span> | |
| <span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'ini'</span><span class="p">,</span> <span class="s">'cfg'</span><span class="p">]</span> | |
| <span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.ini'</span><span class="p">,</span> <span class="s">'*.cfg'</span><span class="p">]</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'\s+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r';.*?$'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'\[.*?\]$'</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'(.*?)(\s*)(=)(\s*)(.*?)$'</span><span class="p">,</span> | |
| <span class="n">bygroups</span><span class="p">(</span><span class="n">Name</span><span class="o">.</span><span class="n">Attribute</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="n">Operator</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="n">String</span><span class="p">))</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>The lexer first looks for whitespace, comments and section names. And later it | |
| looks for a line that looks like a key, value pair, separated by an <tt class="docutils literal">'='</tt> | |
| sign, and optional whitespace.</p> | |
| <p>The <cite>bygroups</cite> helper makes sure that each group is yielded with a different | |
| token type. First the <cite>Name.Attribute</cite> token, then a <cite>Text</cite> token for the | |
| optional whitespace, after that an <cite>Operator</cite> token for the equals sign. Then a | |
| <cite>Text</cite> token for the whitespace again. The rest of the line is returned as | |
| <cite>String</cite>.</p> | |
| <p>Note that for this to work, every part of the match must be inside a capturing | |
| group (a <tt class="docutils literal"><span class="pre">(...)</span></tt>), and there must not be any nested capturing groups. If you | |
| nevertheless need a group, use a non-capturing group defined using this syntax: | |
| <tt class="docutils literal"><span class="pre">r'(?:some|words|here)'</span></tt> (note the <tt class="docutils literal"><span class="pre">?:</span></tt> after the beginning parenthesis).</p> | |
| <p>If you find yourself needing a capturing group inside the regex which | |
| shouldn't be part of the output but is used in the regular expressions for | |
| backreferencing (e.g. <tt class="docutils literal"><span class="pre">r'(<(foo|bar)>)(.*?)(</\2>)'</span></tt>), you can pass <cite>None</cite> | |
| to the bygroups function and that group will be skipped in the | |
| output.</p> | |
| </div> | |
| <div class="section" id="changing-states"> | |
| <h3>Changing states</h3> | |
| <p>Many lexers need multiple states to work as expected. For example, some | |
| languages allow multiline comments to be nested. Since this is a recursive | |
| pattern it's impossible to lex just using regular expressions.</p> | |
| <p>Here is the solution:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span> | |
| <span class="k">class</span> <span class="nc">ExampleLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">name</span> <span class="o">=</span> <span class="s">'Example Lexer with states'</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'[^/]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'/\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'comment'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'//.*?$'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Singleline</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'/'</span><span class="p">,</span> <span class="n">Text</span><span class="p">)</span> | |
| <span class="p">],</span> | |
| <span class="s">'comment'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'[^*/]'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'/\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'#push'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'[*/]'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Multiline</span><span class="p">)</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>This lexer starts lexing in the <tt class="docutils literal">'root'</tt> state. It tries to match as much as | |
| possible until it finds a slash (<tt class="docutils literal">'/'</tt>). If the next character after the slash | |
| is a star (<tt class="docutils literal">'*'</tt>) the <cite>RegexLexer</cite> sends those two characters to the output | |
| stream marked as <cite>Comment.Multiline</cite> and continues parsing with the rules | |
| defined in the <tt class="docutils literal">'comment'</tt> state.</p> | |
| <p>If there wasn't a star after the slash, the <cite>RegexLexer</cite> checks if it's a | |
| singleline comment (e.g. followed by a second slash). If this also wasn't the | |
| case it must be a single slash (the separate regex for a single slash must also | |
| be given, else the slash would be marked as an error token).</p> | |
| <p>Inside the <tt class="docutils literal">'comment'</tt> state, we do the same thing again. Scan until the lexer | |
| finds a star or slash. If it's the opening of a multiline comment, push the | |
| <tt class="docutils literal">'comment'</tt> state on the stack and continue scanning, again in the | |
| <tt class="docutils literal">'comment'</tt> state. Else, check if it's the end of the multiline comment. If | |
| yes, pop one state from the stack.</p> | |
| <p>Note: If you pop from an empty stack you'll get an <cite>IndexError</cite>. (There is an | |
| easy way to prevent this from happening: don't <tt class="docutils literal">'#pop'</tt> in the root state).</p> | |
| <p>If the <cite>RegexLexer</cite> encounters a newline that is flagged as an error token, the | |
| stack is emptied and the lexer continues scanning in the <tt class="docutils literal">'root'</tt> state. This | |
| helps produce error-tolerant highlighting for erroneous input, e.g. when a | |
| single-line string is not closed.</p> | |
| </div> | |
| <div class="section" id="advanced-state-tricks"> | |
| <h3>Advanced state tricks</h3> | |
| <p>There are a few more things you can do with states:</p> | |
| <ul> | |
| <li><p class="first">You can push multiple states onto the stack if you give a tuple instead of a | |
| simple string as the third item in a rule tuple. For example, if you want to | |
| match a comment containing a directive, something like:</p> | |
| <pre class="literal-block"> | |
| /* <processing directive> rest of comment */ | |
| </pre> | |
| <p>you can use this rule:</p> | |
| <div class="syntax"><pre><span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'/\* <'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="p">(</span><span class="s">'comment'</span><span class="p">,</span> <span class="s">'directive'</span><span class="p">)),</span> | |
| <span class="o">...</span> | |
| <span class="p">],</span> | |
| <span class="s">'directive'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'[^>]*'</span><span class="p">,</span> <span class="n">Comment</span><span class="o">.</span><span class="n">Directive</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'>'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span> | |
| <span class="p">],</span> | |
| <span class="s">'comment'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'[^*]+'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'\*'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>When this encounters the above sample, first <tt class="docutils literal">'comment'</tt> and <tt class="docutils literal">'directive'</tt> | |
| are pushed onto the stack, then the lexer continues in the directive state | |
| until it finds the closing <tt class="docutils literal">></tt>, then it continues in the comment state until | |
| the closing <tt class="docutils literal">*/</tt>. Then, both states are popped from the stack again and | |
| lexing continues in the root state.</p> | |
| <p><em>New in Pygments 0.9:</em> The tuple can contain the special <tt class="docutils literal">'#push'</tt> and | |
| <tt class="docutils literal">'#pop'</tt> (but not <tt class="docutils literal">'#pop:n'</tt>) directives.</p> | |
| </li> | |
| <li><p class="first">You can include the rules of a state in the definition of another. This is | |
| done by using <cite>include</cite> from <cite>pygments.lexer</cite>:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span><span class="p">,</span> <span class="n">include</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span> | |
| <span class="k">class</span> <span class="nc">ExampleLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'comments'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'/\*.*?\*/'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'//.*?\n'</span><span class="p">,</span> <span class="n">Comment</span><span class="p">),</span> | |
| <span class="p">],</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="n">include</span><span class="p">(</span><span class="s">'comments'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'(function )(\w+)( {)'</span><span class="p">,</span> | |
| <span class="n">bygroups</span><span class="p">(</span><span class="n">Keyword</span><span class="p">,</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">),</span> <span class="s">'function'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'.'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">],</span> | |
| <span class="s">'function'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'[^}/]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="n">include</span><span class="p">(</span><span class="s">'comments'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'/'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'}'</span><span class="p">,</span> <span class="n">Keyword</span><span class="p">,</span> <span class="s">'#pop'</span><span class="p">),</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>This is a hypothetical lexer for a language that consists of functions and | |
| comments. Because comments can occur at toplevel and in functions, we need | |
| rules for comments in both states. As you can see, the <cite>include</cite> helper saves | |
| repeating rules that occur more than once (in this example, the state | |
| <tt class="docutils literal">'comments'</tt> will never be entered by the lexer, as it's only there to be | |
| included in <tt class="docutils literal">'root'</tt> and <tt class="docutils literal">'function'</tt>).</p> | |
| </li> | |
| <li><p class="first">Sometimes, you may want to "combine" a state from existing ones. This is | |
| possible with the <cite>combined</cite> helper from <cite>pygments.lexer</cite>.</p> | |
| <p>If you, instead of a new state, write <tt class="docutils literal"><span class="pre">combined('state1',</span> 'state2')</tt> as the | |
| third item of a rule tuple, a new anonymous state will be formed from state1 | |
| and state2 and if the rule matches, the lexer will enter this state.</p> | |
| <p>This is not used very often, but can be helpful in some cases, such as the | |
| <cite>PythonLexer</cite>'s string literal processing.</p> | |
| </li> | |
| <li><p class="first">If you want your lexer to start lexing in a different state you can modify | |
| the stack by overloading the <cite>get_tokens_unprocessed()</cite> method:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span> | |
| <span class="k">class</span> <span class="nc">MyLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span><span class="o">...</span><span class="p">}</span> | |
| <span class="k">def</span> <span class="nf">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span> | |
| <span class="n">stack</span> <span class="o">=</span> <span class="p">[</span><span class="s">'root'</span><span class="p">,</span> <span class="s">'otherstate'</span><span class="p">]</span> | |
| <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">RegexLexer</span><span class="o">.</span><span class="n">get_tokens_unprocessed</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="n">stack</span><span class="p">):</span> | |
| <span class="k">yield</span> <span class="n">item</span> | |
| </pre></div> | |
| <p>Some lexers like the <cite>PhpLexer</cite> use this to make the leading <tt class="docutils literal"><span class="pre"><?php</span></tt> | |
| preprocessor comments optional. Note that you can crash the lexer easily | |
| by putting values into the stack that don't exist in the token map. Also | |
| removing <tt class="docutils literal">'root'</tt> from the stack can result in strange errors!</p> | |
| </li> | |
| <li><p class="first">An empty regex at the end of a state list, combined with <tt class="docutils literal">'#pop'</tt>, can | |
| act as a return point from a state that doesn't have a clear end marker.</p> | |
| </li> | |
| </ul> | |
| </div> | |
| <div class="section" id="using-multiple-lexers"> | |
| <h3>Using multiple lexers</h3> | |
| <p>Using multiple lexers for the same input can be tricky. One of the easiest | |
| combination techniques is shown here: You can replace the token type entry in a | |
| rule tuple (the second item) with a lexer class. The matched text will then be | |
| lexed with that lexer, and the resulting tokens will be yielded.</p> | |
| <p>For example, look at this stripped-down HTML lexer:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span><span class="p">,</span> <span class="n">bygroups</span><span class="p">,</span> <span class="n">using</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="o">*</span> | |
| <span class="kn">from</span> <span class="nn">pygments.lexers.web</span> <span class="kn">import</span> <span class="n">JavascriptLexer</span> | |
| <span class="k">class</span> <span class="nc">HtmlLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="n">name</span> <span class="o">=</span> <span class="s">'HTML'</span> | |
| <span class="n">aliases</span> <span class="o">=</span> <span class="p">[</span><span class="s">'html'</span><span class="p">]</span> | |
| <span class="n">filenames</span> <span class="o">=</span> <span class="p">[</span><span class="s">'*.html'</span><span class="p">,</span> <span class="s">'*.htm'</span><span class="p">]</span> | |
| <span class="n">flags</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">IGNORECASE</span> <span class="o">|</span> <span class="n">re</span><span class="o">.</span><span class="n">DOTALL</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">'[^<&]+'</span><span class="p">,</span> <span class="n">Text</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">'&.*?;'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Entity</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'<\s*script\s*'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">,</span> <span class="p">(</span><span class="s">'script-content'</span><span class="p">,</span> <span class="s">'tag'</span><span class="p">)),</span> | |
| <span class="p">(</span><span class="s">r'<\s*[a-zA-Z0-9:]+'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">,</span> <span class="s">'tag'</span><span class="p">),</span> | |
| <span class="p">(</span><span class="s">r'<\s*/\s*[a-zA-Z0-9:]+\s*>'</span><span class="p">,</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">),</span> | |
| <span class="p">],</span> | |
| <span class="s">'script-content'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'(.+?)(<\s*/\s*script\s*>)'</span><span class="p">,</span> | |
| <span class="n">bygroups</span><span class="p">(</span><span class="n">using</span><span class="p">(</span><span class="n">JavascriptLexer</span><span class="p">),</span> <span class="n">Name</span><span class="o">.</span><span class="n">Tag</span><span class="p">),</span> | |
| <span class="s">'#pop'</span><span class="p">),</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>Here the content of a <tt class="docutils literal"><script></tt> tag is passed to a newly created instance of | |
| a <cite>JavascriptLexer</cite> and not processed by the <cite>HtmlLexer</cite>. This is done using the | |
| <cite>using</cite> helper that takes the other lexer class as its parameter.</p> | |
| <p>Note the combination of <cite>bygroups</cite> and <cite>using</cite>. This makes sure that the content | |
| up to the <tt class="docutils literal"></script></tt> end tag is processed by the <cite>JavascriptLexer</cite>, while the | |
| end tag is yielded as a normal token with the <cite>Name.Tag</cite> type.</p> | |
| <p>As an additional goodie, if the lexer class is replaced by <cite>this</cite> (imported from | |
| <cite>pygments.lexer</cite>), the "other" lexer will be the current one (because you cannot | |
| refer to the current class within the code that runs at class definition time).</p> | |
| <p>Also note the <tt class="docutils literal"><span class="pre">(r'<\s*script\s*',</span> Name.Tag, <span class="pre">('script-content',</span> <span class="pre">'tag'))</span></tt> rule. | |
| Here, two states are pushed onto the state stack, <tt class="docutils literal"><span class="pre">'script-content'</span></tt> and | |
| <tt class="docutils literal">'tag'</tt>. That means that first <tt class="docutils literal">'tag'</tt> is processed, which will parse | |
| attributes and the closing <tt class="docutils literal">></tt>, then the <tt class="docutils literal">'tag'</tt> state is popped and the | |
| next state on top of the stack will be <tt class="docutils literal"><span class="pre">'script-content'</span></tt>.</p> | |
| <p>The <cite>using()</cite> helper has a special keyword argument, <cite>state</cite>, which works as | |
| follows: if given, the lexer to use initially is not in the <tt class="docutils literal">"root"</tt> state, | |
| but in the state given by this argument. This <em>only</em> works with a <cite>RegexLexer</cite>.</p> | |
| <p>Any other keyword arguments passed to <cite>using()</cite> are added to the keyword | |
| arguments used to create the lexer.</p> | |
| </div> | |
| <div class="section" id="delegating-lexer"> | |
| <h3>Delegating Lexer</h3> | |
| <p>Another approach for nested lexers is the <cite>DelegatingLexer</cite> which is for | |
| example used for the template engine lexers. It takes two lexers as | |
| arguments on initialisation: a <cite>root_lexer</cite> and a <cite>language_lexer</cite>.</p> | |
| <p>The input is processed as follows: First, the whole text is lexed with the | |
| <cite>language_lexer</cite>. All tokens yielded with a type of <tt class="docutils literal">Other</tt> are then | |
| concatenated and given to the <cite>root_lexer</cite>. The language tokens of the | |
| <cite>language_lexer</cite> are then inserted into the <cite>root_lexer</cite>'s token stream | |
| at the appropriate positions.</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">DelegatingLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.lexers.web</span> <span class="kn">import</span> <span class="n">HtmlLexer</span><span class="p">,</span> <span class="n">PhpLexer</span> | |
| <span class="k">class</span> <span class="nc">HtmlPhpLexer</span><span class="p">(</span><span class="n">DelegatingLexer</span><span class="p">):</span> | |
| <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">):</span> | |
| <span class="nb">super</span><span class="p">(</span><span class="n">HtmlPhpLexer</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="n">HtmlLexer</span><span class="p">,</span> <span class="n">PhpLexer</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> | |
| </pre></div> | |
| <p>This procedure ensures that e.g. HTML with template tags in it is highlighted | |
| correctly even if the template tags are put into HTML tags or attributes.</p> | |
| <p>If you want to change the needle token <tt class="docutils literal">Other</tt> to something else, you can | |
| give the lexer another token type as the third parameter:</p> | |
| <div class="syntax"><pre><span class="n">DelegatingLexer</span><span class="o">.</span><span class="n">__init__</span><span class="p">(</span><span class="n">MyLexer</span><span class="p">,</span> <span class="n">OtherLexer</span><span class="p">,</span> <span class="n">Text</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span> | |
| </pre></div> | |
| </div> | |
| <div class="section" id="callbacks"> | |
| <h3>Callbacks</h3> | |
| <p>Sometimes the grammar of a language is so complex that a lexer would be unable | |
| to parse it just by using regular expressions and stacks.</p> | |
| <p>For this, the <cite>RegexLexer</cite> allows callbacks to be given in rule tuples, instead | |
| of token types (<cite>bygroups</cite> and <cite>using</cite> are nothing else but preimplemented | |
| callbacks). The callback must be a function taking two arguments:</p> | |
| <ul class="simple"> | |
| <li>the lexer itself</li> | |
| <li>the match object for the last matched rule</li> | |
| </ul> | |
| <p>The callback must then return an iterable of (or simply yield) <tt class="docutils literal">(index, | |
| tokentype, value)</tt> tuples, which are then just passed through by | |
| <cite>get_tokens_unprocessed()</cite>. The <tt class="docutils literal">index</tt> here is the position of the token in | |
| the input string, <tt class="docutils literal">tokentype</tt> is the normal token type (like <cite>Name.Builtin</cite>), | |
| and <tt class="docutils literal">value</tt> the associated part of the input string.</p> | |
| <p>You can see an example here:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">RegexLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Generic</span> | |
| <span class="k">class</span> <span class="nc">HypotheticLexer</span><span class="p">(</span><span class="n">RegexLexer</span><span class="p">):</span> | |
| <span class="k">def</span> <span class="nf">headline_callback</span><span class="p">(</span><span class="n">lexer</span><span class="p">,</span> <span class="n">match</span><span class="p">):</span> | |
| <span class="n">equal_signs</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> | |
| <span class="n">text</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> | |
| <span class="k">yield</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">(),</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Headline</span><span class="p">,</span> <span class="n">equal_signs</span> <span class="o">+</span> <span class="n">text</span> <span class="o">+</span> <span class="n">equal_signs</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'(=+)(.*?)(\1)'</span><span class="p">,</span> <span class="n">headline_callback</span><span class="p">)</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>If the regex for the <cite>headline_callback</cite> matches, the function is called with the | |
| match object. Note that after the callback is done, processing continues | |
| normally, that is, after the end of the previous match. The callback has no | |
| possibility to influence the position.</p> | |
| <p>There are not really any simple examples for lexer callbacks, but you can see | |
| them in action e.g. in the <a class="reference external" href="http://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/compiled.py">compiled.py</a> source code in the <cite>CLexer</cite> and | |
| <cite>JavaLexer</cite> classes.</p> | |
| </div> | |
| <div class="section" id="the-extendedregexlexer-class"> | |
| <h3>The ExtendedRegexLexer class</h3> | |
| <p>The <cite>RegexLexer</cite>, even with callbacks, unfortunately isn't powerful enough for | |
| the funky syntax rules of some languages that will go unnamed, such as Ruby.</p> | |
| <p>But fear not; even then you don't have to abandon the regular expression | |
| approach. For Pygments has a subclass of <cite>RegexLexer</cite>, the <cite>ExtendedRegexLexer</cite>. | |
| All features known from RegexLexers are available here too, and the tokens are | |
| specified in exactly the same way, <em>except</em> for one detail:</p> | |
| <p>The <cite>get_tokens_unprocessed()</cite> method holds its internal state data not as local | |
| variables, but in an instance of the <cite>pygments.lexer.LexerContext</cite> class, and | |
| that instance is passed to callbacks as a third argument. This means that you | |
| can modify the lexer state in callbacks.</p> | |
| <p>The <cite>LexerContext</cite> class has the following members:</p> | |
| <ul class="simple"> | |
| <li><cite>text</cite> -- the input text</li> | |
| <li><cite>pos</cite> -- the current starting position that is used for matching regexes</li> | |
| <li><cite>stack</cite> -- a list containing the state stack</li> | |
| <li><cite>end</cite> -- the maximum position to which regexes are matched, this defaults to | |
| the length of <cite>text</cite></li> | |
| </ul> | |
| <p>Additionally, the <cite>get_tokens_unprocessed()</cite> method can be given a | |
| <cite>LexerContext</cite> instead of a string and will then process this context instead of | |
| creating a new one for the string argument.</p> | |
| <p>Note that because you can set the current position to anything in the callback, | |
| it won't automatically be set by the caller after the callback is finished. | |
| For example, this is how the hypothetical lexer above would be written with the | |
| <cite>ExtendedRegexLexer</cite>:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexer</span> <span class="kn">import</span> <span class="n">ExtendedRegexLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Generic</span> | |
| <span class="k">class</span> <span class="nc">ExHypotheticLexer</span><span class="p">(</span><span class="n">ExtendedRegexLexer</span><span class="p">):</span> | |
| <span class="k">def</span> <span class="nf">headline_callback</span><span class="p">(</span><span class="n">lexer</span><span class="p">,</span> <span class="n">match</span><span class="p">,</span> <span class="n">ctx</span><span class="p">):</span> | |
| <span class="n">equal_signs</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span> | |
| <span class="n">text</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> | |
| <span class="k">yield</span> <span class="n">match</span><span class="o">.</span><span class="n">start</span><span class="p">(),</span> <span class="n">Generic</span><span class="o">.</span><span class="n">Headline</span><span class="p">,</span> <span class="n">equal_signs</span> <span class="o">+</span> <span class="n">text</span> <span class="o">+</span> <span class="n">equal_signs</span> | |
| <span class="n">ctx</span><span class="o">.</span><span class="n">pos</span> <span class="o">=</span> <span class="n">match</span><span class="o">.</span><span class="n">end</span><span class="p">()</span> | |
| <span class="n">tokens</span> <span class="o">=</span> <span class="p">{</span> | |
| <span class="s">'root'</span><span class="p">:</span> <span class="p">[</span> | |
| <span class="p">(</span><span class="s">r'(=+)(.*?)(\1)'</span><span class="p">,</span> <span class="n">headline_callback</span><span class="p">)</span> | |
| <span class="p">]</span> | |
| <span class="p">}</span> | |
| </pre></div> | |
| <p>This might sound confusing (and it really can be). But it is needed, and for an | |
| example look at the Ruby lexer in <a class="reference external" href="https://bitbucket.org/birkenfeld/pygments-main/src/tip/pygments/lexers/agile.py">agile.py</a>.</p> | |
| </div> | |
| <div class="section" id="filtering-token-streams"> | |
| <h3>Filtering Token Streams</h3> | |
| <p>Some languages ship a lot of builtin functions (for example PHP). The total | |
| amount of those functions differs from system to system because not everybody | |
| has every extension installed. In the case of PHP there are over 3000 builtin | |
| functions. That's an incredibly huge number of functions, much more than you | |
| can put into a regular expression.</p> | |
| <p>But because only <cite>Name</cite> tokens can be function names it's solvable by overriding | |
| the <tt class="docutils literal">get_tokens_unprocessed()</tt> method. The following lexer subclasses the | |
| <cite>PythonLexer</cite> so that it highlights some additional names as pseudo keywords:</p> | |
| <div class="syntax"><pre><span class="kn">from</span> <span class="nn">pygments.lexers.agile</span> <span class="kn">import</span> <span class="n">PythonLexer</span> | |
| <span class="kn">from</span> <span class="nn">pygments.token</span> <span class="kn">import</span> <span class="n">Name</span><span class="p">,</span> <span class="n">Keyword</span> | |
| <span class="k">class</span> <span class="nc">MyPythonLexer</span><span class="p">(</span><span class="n">PythonLexer</span><span class="p">):</span> | |
| <span class="n">EXTRA_KEYWORDS</span> <span class="o">=</span> <span class="p">[</span><span class="s">'foo'</span><span class="p">,</span> <span class="s">'bar'</span><span class="p">,</span> <span class="s">'foobar'</span><span class="p">,</span> <span class="s">'barfoo'</span><span class="p">,</span> <span class="s">'spam'</span><span class="p">,</span> <span class="s">'eggs'</span><span class="p">]</span> | |
| <span class="k">def</span> <span class="nf">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span> | |
| <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">PythonLexer</span><span class="o">.</span><span class="n">get_tokens_unprocessed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">):</span> | |
| <span class="k">if</span> <span class="n">token</span> <span class="ow">is</span> <span class="n">Name</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">EXTRA_KEYWORDS</span><span class="p">:</span> | |
| <span class="k">yield</span> <span class="n">index</span><span class="p">,</span> <span class="n">Keyword</span><span class="o">.</span><span class="n">Pseudo</span><span class="p">,</span> <span class="n">value</span> | |
| <span class="k">else</span><span class="p">:</span> | |
| <span class="k">yield</span> <span class="n">index</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">value</span> | |
| </pre></div> | |
| <p>The <cite>PhpLexer</cite> and <cite>LuaLexer</cite> use this method to resolve builtin functions.</p> | |
| <p><strong>Note</strong> Do not confuse this with the <a class="reference external" href="./filters.html">filter</a> system.</p> | |
| </div> | |
| </div> | |
| </body> | |
| <!-- generated on: 2013-01-09 17:48:42.356609 | |
| file id: lexerdevelopment --> | |
| </html> |