# -*- coding: utf-8 -*-
"""
    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
import cStringIO

from pygments.lexer import Lexer
from pygments.token import Token, Error, Text
from pygments.util import get_choice_opt, b


__all__ = ['TextLexer', 'RawTokenLexer']


class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        yield 0, Text, text
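
# A minimal usage sketch, assuming Pygments is importable: TextLexer emits the
# whole input as a single Text token, so highlighting with it returns the
# input essentially unchanged apart from whatever markup the formatter adds.
#
#   >>> from pygments import highlight
#   >>> from pygments.formatters import NullFormatter
#   >>> highlight(u'no markup here\n', TextLexer(), NullFormatter())
#   u'no markup here\n'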

_ttype_cache = {}

line_re = re.compile(b('.*?\n'))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = []
    mimetypes = ['application/x-pygments-tokens']

    def __init__(self, **options):
        self.compress = get_choice_opt(options, 'compress',
                                       ['', 'none', 'gz', 'bz2'], '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        if isinstance(text, unicode):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b('\n')) + b('\n')
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
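
    # Each line of the raw stream is expected to look like
    # "Token.Name.Builtin<TAB>u'value repr'\n": the parser below splits on
    # the first tab, resolves the dotted token type via attribute access on
    # `Token` (caching the result in `_ttype_cache`), and undoes the value's
    # repr with val[2:-2].decode('unicode-escape').  Lines without a tab
    # become Error tokens instead of aborting the whole lex.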
    def get_tokens_unprocessed(self, text):
        length = 0
        for match in line_re.finditer(text):
            try:
                ttypestr, val = match.group().split(b('\t'), 1)
            except ValueError:
                val = match.group().decode(self.encoding)
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                if not ttype:
                    ttype = Token
                    ttypes = ttypestr.split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                val = val[2:-2].decode('unicode-escape')
            yield length, ttype, val
            length += len(val)
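
# A minimal round-trip sketch, assuming a Python 2 environment with Pygments
# installed: RawTokenLexer is the counterpart of
# pygments.formatters.RawTokenFormatter, so a dumped token stream can be
# re-lexed roughly like this:
#
#   >>> from pygments import highlight
#   >>> from pygments.formatters import RawTokenFormatter, HtmlFormatter
#   >>> from pygments.lexers import PythonLexer
#   >>> raw = highlight(u'print 1\n', PythonLexer(), RawTokenFormatter())
#   >>> # ``raw`` now contains lines such as "Token.Keyword\tu'print'"
#   >>> html = highlight(raw, RawTokenLexer(), HtmlFormatter())
#
# Passing compress='gz' or compress='bz2' mirrors the ``compress`` option of
# RawTokenFormatter, which writes a gzip- or bz2-compressed stream.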