In [4]:
import ply.lex as lex
# NOTE(review): presumably set because a notebook namespace has no
# `__file__` of its own and ply.lex looks one up while building — confirm.
__file__ = "notebook"

In [18]:
class MyLexer(object):
    """PLY lexer for ``;``-commented, bracket-sectioned text with C-style
    preprocessor directives.

    Two lexer states are used:

    * ``INITIAL`` -- ordinary statements: identifiers, numbers, ``[`` / ``]``
      and ``;`` line comments.
    * ``preprocessor`` (exclusive) -- entered when a ``#`` directive is seen
      and left again at the next newline; it additionally recognises
      ``<...>`` and ``"..."`` operands and ``//`` / ``/* */`` comments.

    Call :meth:`build` before :meth:`test`.

    Note: for function rules the regular expression *is* the function's
    docstring, so those functions are documented with ``#`` comments only.
    """

    # NOTE(review): presumably consulted by ply.lex when the rules are
    # collected from this instance (a notebook has no real file) — confirm.
    __file__ = ":class: MyLexer"

    # Extra lexer states besides the implicit INITIAL one.
    states = (
        ("preprocessor", 'exclusive'),
    )

    # List of token names.   This is always required
    tokens = (
        "PPDirective",
        "PPAngleQuote",
        "PPQuote",
        "PPLineComment",
        "PPBlockComment",
        "Identifier",
        "Float",
        "Integer",
        "Newline",
        "LB",
        "RB",
        "Comment"
    )

    # A directive keyword switches the lexer into the "preprocessor" state,
    # which stays active until t_preprocessor_Newline pops it again.
    # ('#' is escaped in the pattern.)
    def t_PPDirective(self, t):
        r"\#(include|define|undef|ifdef|ifndef|else|endif)"
        t.lexer.push_state("preprocessor")
        return t

    # Regular expression rules for simple tokens
    t_preprocessor_PPAngleQuote = r"<([^>\n]+)>"
    t_preprocessor_PPQuote = r'"([^"\n]+)"'
    t_preprocessor_PPLineComment = r"//.*"
    # Non-greedy body: stop at the FIRST "*/" so that two block comments on
    # one line are not merged into a single token.
    t_preprocessor_PPBlockComment = r"/\*[^\n]*?\*/"
    # "*" (not "+") after the first character so that one-character
    # identifiers such as "a" are accepted as well.
    t_ANY_Identifier = r"[a-zA-Z_][a-zA-Z0-9_]*"
    t_LB = r'\['
    t_RB = r'\]'
    t_Comment = r';.*'

    # A regular expression rule with some action code
    # Note addition of self parameter since we're in a class
    # Float is defined before Integer so that "1.5" is not split into
    # Integer "1" plus a remainder.  The exponent marker may be "e" or "E".
    def t_ANY_Float(self, t):
        r"[-+]?(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?"
        t.value = float(t.value)
        return t

    # Optionally signed decimal integer; the token value becomes an int.
    def t_ANY_Integer(self, t):
        r'[-+]?\d+'
        t.value = int(t.value)
        return t

    # Define a rule so we can track line numbers
    # In the preprocessor state a newline also ends the directive, so the
    # state pushed by t_PPDirective is popped here.
    def t_preprocessor_Newline(self, t):
        r'\n'
        t.lexer.lineno += len(t.value)
        t.value = None
        t.lexer.pop_state()
        return t

    # Newline in the INITIAL state: bump the line counter and emit a
    # value-less Newline token.
    def t_Newline(self, t):
        r'\n'
        t.lexer.lineno += len(t.value)
        t.value = None
        return t

    # A string containing ignored characters (spaces and tabs), shared by the
    # INITIAL and preprocessor states.
    t_INITIAL_preprocessor_ignore = ' \t'

    # Error handling rules: t.value holds the remaining input starting at the
    # offending character, so its first line is quoted in the message.
    def t_error(self, t):
        raise ValueError(f"Illegal character {t.value[0]} in statement {t.value.splitlines()[0]}")

    def t_preprocessor_error(self, t):
        raise ValueError(f"Illegal character {t.value[0]} in preprocessor statement {t.value.splitlines()[0]}")

    def build(self, **kwargs):
        """Build the PLY lexer from the rules on this instance.

        Args:
            **kwargs: Forwarded unchanged to ``lex.lex``.

        The resulting lexer is stored on ``self.lexer``.
        """
        self.lexer = lex.lex(module=self, **kwargs)

    def test(self, data):
        """Tokenize ``data`` and return every token as a list.

        Args:
            data: Input text handed to ``self.lexer.input``.

        Returns:
            list: The tokens produced by ``self.lexer.token()``, in order,
            up to (not including) the ``None`` that marks end of input.

        Raises:
            AttributeError: If :meth:`build` has not been called yet
                (``self.lexer`` does not exist).
            ValueError: Raised by the error rules on an illegal character.
        """
        self.lexer.input(data)
        # token() returns None once the input is exhausted; use it as the
        # sentinel of the two-argument iter() form.
        return list(iter(self.lexer.token, None))

    def tokenize(self, data):
        """Placeholder: not implemented yet, always returns ``None``."""
        pass


def build_lexer(lexer_cls, **kwargs):
    """Instantiate ``lexer_cls`` and build its underlying lexer.

    Args:
        lexer_cls: A class that can be called with no arguments and whose
            instances provide a ``build(**kwargs)`` method (e.g. ``MyLexer``).
        **kwargs: Forwarded unchanged to ``build``.  With no keyword
            arguments this is equivalent to calling ``build()``.

    Returns:
        The newly created instance of ``lexer_cls`` after ``build`` ran.
    """
    lexer = lexer_cls()
    lexer.build(**kwargs)
    return lexer

In [19]:
# Create a MyLexer instance and build its lexer in one step.
l = build_lexer(MyLexer)

In [20]:
# Inspect `__file__` on the instance; it resolves to the value defined on
# the MyLexer class.
l.__file__

':class: MyLexer'

In [21]:
# Sample input: a `;` comment, preprocessor directives with <...> / "..."
# operands and a block comment, a bracketed section, and numeric values.
example = """\
; Header
#include <iostream> <top>
#include "topol.top" /* asdf *asdf */
#define A123 _a_1_23 
[ section ]
val _Avl .1 -1 ;// dfgh
"""

In [22]:
# Tokenize the sample input and display the resulting list of tokens.
l.test(example)

[LexToken(Comment,'; Header',1,0),
 LexToken(Newline,None,1,8),
 LexToken(PPDirective,'#include',2,9),
 LexToken(PPAngleQuote,'<iostream>',2,18),
 LexToken(PPAngleQuote,'<top>',2,29),
 LexToken(Newline,None,2,34),
 LexToken(PPDirective,'#include',3,35),
 LexToken(PPQuote,'"topol.top"',3,44),
 LexToken(PPBlockComment,'/* asdf *asdf */',3,56),
 LexToken(Newline,None,3,72),
 LexToken(PPDirective,'#define',4,73),
 LexToken(Identifier,'A123',4,81),
 LexToken(Identifier,'_a_1_23',4,86),
 LexToken(Newline,None,4,94),
 LexToken(LB,'[',5,95),
 LexToken(Identifier,'section',5,97),
 LexToken(RB,']',5,105),
 LexToken(Newline,None,5,106),
 LexToken(Identifier,'val',6,107),
 LexToken(Identifier,'_Avl',6,111),
 LexToken(Float,0.1,6,116),
 LexToken(Integer,-1,6,119),
 LexToken(Comment,';// dfgh',6,122),
 LexToken(Newline,None,6,130)]