In [1]:
from sly import Lexer


class LangLexer(Lexer):
    """Tokenizer for the toy language, built on SLY's ``Lexer``.

    Emits keyword, identifier, literal (INT / FLOAT / STRING) and operator
    tokens.  Single-character operators and punctuation are emitted as SLY
    literals.  Spaces, tabs and ``//`` comments are skipped; newlines are
    skipped as well but advance ``lineno``.
    """

    # Every named token type this lexer can emit (keywords, data types,
    # operators, and helper keywords such as TYPEOF).

    tokens = {ID, INT, FLOAT, ASSIGN, STRING, LET,
              IF, ELSE, EQEQ, SEP, NOTEQ, LESS,
              GREATER, LESSEQ, GREATEREQ, NIL, WHILE,
              FOR, FN, RETURN, LAMBDA, ARROW, TRUE, FALSE,
              AND, OR, SHR, SHL, INC, DEC, PLUSASGN,
              MINUSASGN, STARASGN, SLASHASGN, MODULOASGN,
              ANDASGN, ORASGN, XORASGN, SHLASGN, SHRASGN,
              IMPORT, STRUCT, INT_TYPE, FLOAT_TYPE, BOOL_TYPE,
              LIST_TYPE, DICT_TYPE, STRING_TYPE, TYPEOF,
              LEFTARROW, PIPE, CLASS, DOUBLECOLON, BEGIN, END}

    # Skip spaces and tabs between tokens, and `//` comments up to end of line.

    ignore = ' \t'
    ignore_comment_slash = r'//.*'

    # One-character literals (token type equals the character itself).

    literals = {'=', '+', '-', '/', '*',
                '(', ')', ',', '{', '}',
                '%', '[', ']', '!', '&',
                '|', '^', '?', ':', '~',
                '.'}

    # Multi-character operators.  SLY tries patterns in definition order, so
    # every longer operator is listed before any operator that is its prefix
    # (e.g. `<<=` before `<=`, `<<` and `<`).

    INC = r'\+\+'
    DEC = r'--'
    PIPE = r'\|>'
    PLUSASGN = r'\+='
    MINUSASGN = r'-='
    STARASGN = r'\*='
    SLASHASGN = r'/='
    MODULOASGN = r'%='
    ANDASGN = r'&='
    ORASGN = r'\|='
    # `^` must be escaped: unescaped it is the regex start-of-input anchor,
    # so the pattern would never match the two-character `^=` operator.
    XORASGN = r'\^='
    SHLASGN = r'<<='
    SHRASGN = r'>>='
    ARROW = r'=>'
    LESSEQ = r'<='
    GREATEREQ = r'>='
    LEFTARROW = r'<-'
    SHR = r'>>'
    SHL = r'<<'
    LESS = r'<'
    GREATER = r'>'
    NOTEQ = r'!='
    EQEQ = r'=='
    ASSIGN = r'='
    SEP = r';'
    DOUBLECOLON = r'::'

    # Identifiers, with reserved words remapped to their own token types.

    ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    ID['let'] = LET
    ID['if'] = IF
    ID['else'] = ELSE
    ID['nil'] = NIL
    ID['while'] = WHILE
    ID['for'] = FOR
    ID['fn'] = FN
    ID['return'] = RETURN
    ID['lambda'] = LAMBDA
    ID['true'] = TRUE
    ID['false'] = FALSE
    ID['and'] = AND
    ID['or'] = OR
    ID['import'] = IMPORT
    ID['struct'] = STRUCT
    ID['int'] = INT_TYPE
    ID['float'] = FLOAT_TYPE
    ID['string'] = STRING_TYPE
    ID['bool'] = BOOL_TYPE
    ID['list'] = LIST_TYPE
    ID['dict'] = DICT_TYPE
    ID['typeof'] = TYPEOF
    ID['class'] = CLASS

    # Experimental keywords.

    ID['begin'] = BEGIN
    ID['end'] = END

    # FLOAT is defined before INT so that `1.5` is not split into INT '.' INT.
    @_(r'\d+\.\d+')
    def FLOAT(self, t):
        """Convert the matched text of a float literal to a Python ``float``."""
        t.value = float(t.value)
        return t

    @_(r'\d+')
    def INT(self, t):
        """Convert the matched text of an integer literal to a Python ``int``."""
        t.value = int(t.value)
        return t

    @_(r'\".*?(?<!\\)(\\\\)*\"')
    def STRING(self, t):
        """Strip the surrounding quotes and decode backslash escapes.

        Escapes are decoded in a single left-to-right pass so that an escaped
        backslash is consumed together with its partner and cannot combine
        with the following character into a second escape (chained
        ``str.replace`` calls decoded backslash-backslash-n as backslash +
        newline).  Unrecognised escapes are kept verbatim.
        """
        escapes = {
            'n': '\n', 't': '\t', '\\': '\\', '"': '"',
            'a': '\a', 'b': '\b', 'r': '\r', 'v': '\v',
        }
        raw = t.value[1:-1]
        decoded = []
        pos = 0
        while pos < len(raw):
            ch = raw[pos]
            if ch == '\\' and pos + 1 < len(raw) and raw[pos + 1] in escapes:
                # Consume the backslash and the escape letter as one unit.
                decoded.append(escapes[raw[pos + 1]])
                pos += 2
            else:
                decoded.append(ch)
                pos += 1
        t.value = ''.join(decoded)
        return t

    @_(r'\n+')
    def ignore_newline(self, t):
        """Skip newlines while keeping ``lineno`` in step with the input."""
        self.lineno += len(t.value)

    def error(self, t):
        """Report an unrecognised character and resume lexing after it."""
        print("Illegal character '%s' on line %d" % (t.value[0], self.lineno))
        # Advance one character; otherwise the lexer would stay on the same spot.
        self.index += 1

In [3]:
# Build the lexer once; later cells reuse `lex`.
lex = LangLexer()

# Sample program: a `while` loop using the `|` literal and the begin/end keywords.
text = '''

x = 0;
while x < 10 | x >2 {
begin
x = x + 1;
end
}
'''

# Print each token produced for the sample program.
for token in lex.tokenize(text):
    print(token)

Token(type='ID', value='x', lineno=3, index=2)
Token(type='ASSIGN', value='=', lineno=3, index=4)
Token(type='INT', value=0, lineno=3, index=6)
Token(type='SEP', value=';', lineno=3, index=7)
Token(type='WHILE', value='while', lineno=4, index=9)
Token(type='ID', value='x', lineno=4, index=15)
Token(type='LESS', value='<', lineno=4, index=17)
Token(type='INT', value=10, lineno=4, index=19)
Token(type='|', value='|', lineno=4, index=22)
Token(type='ID', value='x', lineno=4, index=24)
Token(type='GREATER', value='>', lineno=4, index=26)
Token(type='INT', value=2, lineno=4, index=27)
Token(type='{', value='{', lineno=4, index=29)
Token(type='BEGIN', value='begin', lineno=5, index=31)
Token(type='ID', value='x', lineno=6, index=37)
Token(type='ASSIGN', value='=', lineno=6, index=39)
Token(type='ID', value='x', lineno=6, index=41)
Token(type='+', value='+', lineno=6, index=43)
Token(type='INT', value=1, lineno=6, index=45)
Token(type='SEP', value=';', lineno=6, index=46)
Token(type='END', va

In [18]:
# Inspect the regex pattern string stored on the lexer for the ID token.
lex.ID

'[a-zA-Z_][a-zA-Z0-9_]*'

In [9]:
from sly import Parser
#from lexer import LangLexer


class LangParser(Parser):
    """SLY parser that turns the LangLexer token stream into a nested AST.

    Most rules return plain tuples whose first element is a string tag
    (for example ``('while', ('condition', ...), ('block', ...))``); literals
    are returned as Python values and a program is a tuple of statements.
    ``program`` is the first rule defined, which SLY uses as the start symbol
    when no explicit ``start`` attribute is given.
    """

    # Token names are shared with the lexer.
    tokens = LangLexer.tokens

    # SLY writes its grammar/conflict report to this file when the class is built.
    debugfile = 'parser.out'

    # Operator precedence, listed from lowest to highest binding strength
    # (SLY convention).  'UMINUS', 'UPLUS' and 'LOGICALNOT' are not lexer
    # tokens; they are only referenced through `%prec` on the unary rules.
    # NOTE(review): '?', ':' and PIPE have no entry here - confirm the
    # resulting conflicts in parser.out are resolved as intended.

    precedence = (
        ('right', PLUSASGN, MINUSASGN, STARASGN, SLASHASGN,
         MODULOASGN, ANDASGN, ORASGN, XORASGN, SHLASGN, SHRASGN),
        ('left', OR),
        ('left', AND),
        ('left', '|'),
        ('left', '^'),
        ('left', '&'),
        ('left', EQEQ, NOTEQ),
        ('left', LESS, LESSEQ, GREATER, GREATEREQ),
        ('left', SHL, SHR),
        ('left', '+', '-'),
        ('left', '*', '/', '%'),
        ('right', 'UMINUS', 'UPLUS', 'LOGICALNOT', INC, DEC),
        ('right', '!'),
    )

    # --- program / statement lists -------------------------------------
    # A program is a tuple of statements; an empty program is ().

    @_('statements')
    def program(self, p):
        return p.statements

    @_('empty')
    def program(self, p):
        return ()

    @_('statement')
    def statements(self, p):
        return (p.statement, )

    @_('statements statement')
    def statements(self, p):
        return p.statements + (p.statement, )

    # --- statement alternatives that just forward a sub-rule's node -----

    @_('import_statement')
    def statement(self, p):
        return p.import_statement

    @_('function_definition')
    def statement(self, p):
        return p.function_definition

    @_('return_statement')
    def statement(self, p):
        return p.return_statement

    @_('while_statement')
    def statement(self, p):
        return p.while_statement

    @_('for_statement')
    def statement(self, p):
        return p.for_statement

    @_('if_statement')
    def statement(self, p):
        return p.if_statement

    @_('struct_definition')
    def statement(self, p):
        return p.struct_definition

    @_('class_definition')
    def statement(self, p):
        return p.class_definition

    # --- class definition: `class Name { fn ... fn ... }` ----------------
    # Yields ('class', name, ('functions', [function nodes])).

    @_('CLASS ID "{" function_definitions "}"')
    def class_definition(self, p):
        return ('class', p.ID, ('functions', p.function_definitions))

    @_('empty')
    def function_definitions(self, p):
        return []

    @_('function_definition')
    def function_definitions(self, p):
        return [p.function_definition]

    @_('function_definitions function_definition')
    def function_definitions(self, p):
        return p.function_definitions + [p.function_definition]

    # --- struct definition: `struct Name { let f: type; ... };` ----------
    # Yields ('struct', name, [(field_name, type_name), ...]).

    @_('STRUCT ID "{" struct_fields "}" SEP')
    def struct_definition(self, p):
        return ('struct', p.ID, p.struct_fields)

    # struct_field already returns a one-element list, so lists are
    # concatenated directly here.
    @_('struct_field')
    def struct_fields(self, p):
        return p.struct_field

    @_('struct_fields struct_field')
    def struct_fields(self, p):
        return p.struct_fields + p.struct_field

    @_('LET ID ":" var_type SEP')
    def struct_field(self, p):
        return [(p.ID, p.var_type)]

    # --- type names: each type keyword maps to its lowercase string ------

    @_('INT_TYPE')
    def var_type(self, p):
        return 'int'

    @_('FLOAT_TYPE')
    def var_type(self, p):
        return 'float'

    @_('STRING_TYPE')
    def var_type(self, p):
        return 'string'

    @_('BOOL_TYPE')
    def var_type(self, p):
        return 'bool'

    @_('LIST_TYPE')
    def var_type(self, p):
        return 'list'

    @_('DICT_TYPE')
    def var_type(self, p):
        return 'dict'

    # --- import: `import "path";` -> ('import', path) --------------------

    @_('IMPORT STRING SEP')
    def import_statement(self, p):
        return ('import', p.STRING)

    @_('var_define SEP')
    def statement(self, p):
        return p.var_define

    # --- function definition: `fn name(params) block` --------------------
    # Yields ('fn', name, ('params', [...]), ('block', statements)).

    @_('FN ID "(" params ")" block')
    def function_definition(self, p):
        return ('fn', p.ID, ('params', p.params), ('block', p.block))

    # --- struct initialisation: `Name <- { expr, ... }` ------------------

    @_('ID LEFTARROW "{" struct_init_exprs "}" ')
    def expr(self, p):
        return ('init_struct', p.ID, p.struct_init_exprs)

    @_('expr')
    def struct_init_exprs(self, p):
        return [p.expr]

    @_('struct_init_exprs "," expr')
    def struct_init_exprs(self, p):
        return p.struct_init_exprs + [p.expr]

    # --- lambda: `lambda(params) => expr` --------------------------------
    # The body expression is wrapped in a 'return' node.
    # NOTE(review): the block here is a single ('return', ...) tuple, whereas
    # function_definition's block is a tuple of statements - confirm the
    # consumer of this AST handles both shapes.

    @_('LAMBDA "(" params ")" ARROW expr')
    def expr(self, p):
        return ('lambda', ('params', p.params), ('block', ('return', p.expr)))

    # --- variable definition ---------------------------------------------
    # `let target = expr` where target is either a var or a dotted getter.

    @_('LET var ASSIGN expr')
    def var_define(self, p):
        return ('var_define', p.var, p.expr)

    @_('LET getter ASSIGN expr')
    def var_define(self, p):
        return ('var_define', p.getter, p.expr)

    # Typed declaration without an initial value: `let name: type;`
    @_('LET var ":" var_type SEP')
    def statement(self, p):
        return ('var_define_no_expr', p.var, p.var_type)

    @_('TYPEOF expr')
    def expr(self, p):
        return ('typeof', p.expr)

    @_('var_assign SEP')
    def statement(self, p):
        return p.var_assign

    @_('RETURN expr SEP')
    def return_statement(self, p):
        return ('return', p.expr)

    # --- assignment ------------------------------------------------------
    # Compound assignments are desugared: `x op= e` becomes
    # ('var_assign', x, (op, ('var', x), e)).

    @_('var ASSIGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, p.expr)

    @_('var PLUSASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('+', ('var', p.var), p.expr))

    @_('var MINUSASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('-', ('var', p.var), p.expr))

    @_('var STARASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('*', ('var', p.var), p.expr))

    @_('var SLASHASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('/', ('var', p.var), p.expr))

    @_('var MODULOASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('%', ('var', p.var), p.expr))

    @_('var ANDASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('&', ('var', p.var), p.expr))

    @_('var ORASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('|', ('var', p.var), p.expr))

    @_('var XORASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('^', ('var', p.var), p.expr))

    @_('var SHLASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('<<', ('var', p.var), p.expr))

    @_('var SHRASGN expr')
    def var_assign(self, p):
        return ('var_assign', p.var, ('>>', ('var', p.var), p.expr))

    # --- control flow ----------------------------------------------------
    # 'if' nodes always have four elements; the else slot is None when absent.

    @_('IF expr block ELSE block')
    def if_statement(self, p):
        return ('if', ('condition', p.expr), ('block', p.block0), ('block', p.block1))

    @_('IF expr block')
    def if_statement(self, p):
        return ('if', ('condition', p.expr), ('block', p.block), None)

    @_('WHILE expr block')
    def while_statement(self, p):
        return ('while', ('condition', p.expr), ('block', p.block))

    # `for init; cond; step block` is desugared into a pair:
    # (init, ('while', cond, block + (step,))).
    # NOTE(review): unlike the other statements, the result's first element is
    # a node rather than a string tag - confirm the AST consumer expects this.
    @_('FOR var_assign SEP expr SEP var_assign block')
    def for_statement(self, p):
        return (p.var_assign0, ('while', ('condition', p.expr), ('block', p.block + (p.var_assign1, ))))

    # Ternary: `cond ? a : b` -> ('?:', cond, a, b)
    @_('expr "?" expr ":" expr')
    def expr(self, p):
        return ('?:', p.expr0, p.expr1, p.expr2)

    # --- calls -------------------------------------------------------------

    @_('ID "(" args ")"')
    def expr(self, p):
        return ('call', p.ID, ('args', p.args))

    @_('ID DOUBLECOLON ID "(" args ")"')
    def expr(self, p):
        return ('class_func_call', p.ID0, p.ID1, ('args', p.args))

    # Pipe: `e |> f(a, b)` becomes a plain call with e prepended to the args.
    @_('expr PIPE ID "(" args ")"')
    def expr(self, p):
        return ('call', p.ID, ('args', [p.expr] + p.args))

    # --- binary comparison / logical / shift / bitwise operators ----------
    # Each yields (operator_string, left, right).

    @_('expr EQEQ expr')
    def expr(self, p):
        return ('==', p.expr0, p.expr1)

    @_('expr NOTEQ expr')
    def expr(self, p):
        return ('!=', p.expr0, p.expr1)

    @_('expr LESSEQ expr')
    def expr(self, p):
        return ('<=', p.expr0, p.expr1)

    @_('expr GREATEREQ expr')
    def expr(self, p):
        return ('>=', p.expr0, p.expr1)

    @_('expr AND expr')
    def expr(self, p):
        return ('and', p.expr0, p.expr1)

    @_('expr OR expr')
    def expr(self, p):
        return ('or', p.expr0, p.expr1)

    @_('expr LESS expr')
    def expr(self, p):
        return ('<', p.expr0, p.expr1)

    @_('expr GREATER expr')
    def expr(self, p):
        return ('>', p.expr0, p.expr1)

    @_('expr SHL expr')
    def expr(self, p):
        return ('<<', p.expr0, p.expr1)

    @_('expr SHR expr')
    def expr(self, p):
        return ('>>', p.expr0, p.expr1)

    @_('expr "&" expr')
    def expr(self, p):
        return ('&', p.expr0, p.expr1)

    @_('expr "^" expr')
    def expr(self, p):
        return ('^', p.expr0, p.expr1)

    @_('expr "|" expr')
    def expr(self, p):
        return ('|', p.expr0, p.expr1)

    # --- unary operators ---------------------------------------------------

    @_('"~" expr %prec LOGICALNOT')
    def expr(self, p):
        return ('~', p.expr)

    # Expression statement: `expr;` yields the expression node itself.
    @_('expr SEP')
    def statement(self, p):
        return p.expr

    @_('"-" expr %prec UMINUS')
    def expr(self, p):
        return ('neg', p.expr)

    # Unary plus produces no node; the operand is returned unchanged.
    @_('"+" expr %prec UPLUS')
    def expr(self, p):
        return p.expr

    @_('"!" expr')
    def expr(self, p):
        return ('!', p.expr)

    # Prefix increment/decrement are desugared to `var = var +/- 1`.
    @_('INC var')
    def var_assign(self, p):
        return ('var_assign', p.var, ('+', ('var', p.var), 1))

    @_('DEC var')
    def var_assign(self, p):
        return ('var_assign', p.var, ('-', ('var', p.var), 1))

    # --- arithmetic --------------------------------------------------------

    @_('expr "+" expr')
    def expr(self, p):
        return ('+', p.expr0, p.expr1)

    @_('expr "-" expr')
    def expr(self, p):
        return ('-', p.expr0, p.expr1)

    @_('expr "*" expr')
    def expr(self, p):
        return ('*', p.expr0, p.expr1)

    @_('expr "/" expr')
    def expr(self, p):
        return ('/', p.expr0, p.expr1)

    @_('expr "%" expr')
    def expr(self, p):
        return ('%', p.expr0, p.expr1)

    @_('"(" expr ")"')
    def expr(self, p):
        return p.expr

    # --- literals: returned as plain Python values -------------------------

    @_('INT')
    def expr(self, p):
        return p.INT

    @_('FLOAT')
    def expr(self, p):
        return p.FLOAT

    @_('STRING')
    def expr(self, p):
        return p.STRING

    @_('TRUE')
    def expr(self, p):
        return True

    @_('FALSE')
    def expr(self, p):
        return False

    # --- list literal: `[e1, e2, ...]` -> Python list of expression nodes --

    @_('list_val')
    def expr(self, p):
        return p.list_val

    @_('"[" exprs "]"')
    def list_val(self, p):
        return p.exprs

    @_('empty')
    def exprs(self, p):
        return []

    @_('expr')
    def exprs(self, p):
        return [p.expr]

    @_('exprs "," expr')
    def exprs(self, p):
        return p.exprs + [p.expr]

    # --- variables and indexing --------------------------------------------
    # NOTE(review): `var "[" expr "]"` is declared both as an `expr` (here) and
    # as a `var` (below) with the same right-hand side; this looks like a
    # reduce/reduce conflict - check parser.out.

    @_('var "[" expr "]"')
    def expr(self, p):
        return ('indexing', ('var', p.var), p.expr)

    @_('var')
    def expr(self, p):
        return ('var', p.var)

    @_('ID')
    def var(self, p):
        return p.ID

    @_('var "[" expr "]"')
    def var(self, p):
        return ('indexing', ('var', p.var), p.expr)

    # `nil` is represented as Python None.
    @_('NIL')
    def expr(self, p):
        return None

    # Empty production used by the optional / possibly-empty list rules.
    @_('')
    def empty(self, p):
        pass

    # --- blocks: either `{ program }` or a single bare statement -----------
    # Both forms return a tuple of statements.

    @_('"{" program "}"')
    def block(self, p):
        return p.program

    @_('statement')
    def block(self, p):
        return (p.statement, )

    # --- parameter lists: `name: type, ...` -> [(name, type_name), ...] ----

    @_('params "," param')
    def params(self, p):
        return p.params + [p.param]

    @_('param')
    def params(self, p):
        return [p.param]

    @_('empty')
    def params(self, p):
        return []

    @_('ID ":" var_type')
    def param(self, p):
        return (p.ID, p.var_type)

    # --- argument lists: comma-separated expressions -> list ---------------

    @_('args "," arg')
    def args(self, p):
        return p.args + [p.arg]

    @_('arg')
    def args(self, p):
        return [p.arg]

    @_('empty')
    def args(self, p):
        return []

    @_('expr')
    def arg(self, p):
        return p.expr

    # --- dict literal: `{"key": expr, ...}` -> Python dict ------------------
    # Later members overwrite earlier ones with the same key (dict merge).

    @_('"{" member_list "}"')
    def expr(self, p):
        return p.member_list

    @_('empty')
    def member_list(self, p):
        return {}

    @_('member')
    def member_list(self, p):
        return p.member

    @_('member_list "," member')
    def member_list(self, p):
        return {**p.member_list, **p.member}

    @_('STRING ":" expr')
    def member(self, p):
        return {p.STRING: p.expr}

    # --- dotted access: `a.b.c` -> ('.', ('.', 'a', 'b'), 'c') --------------
    # NOTE(review): a bare ID can reduce to either `var` or `getter`, and both
    # reduce to `expr`; this looks like another conflict - check parser.out.

    @_('getter "." ID')
    def getter(self, p):
        return ('.', p.getter, p.ID)

    @_('ID')
    def getter(self, p):
        return p.ID

    @_('getter')
    def expr(self, p):
        return p.getter

    # --- experimental begin / end statements --------------------------------
    # Each keyword is a statement on its own and yields the plain string
    # 'begin' or 'end'.

    @_('begin_statement')
    def statement(self, p):
        return p.begin_statement

    @_('end_statement')
    def statement(self, p):
        return p.end_statement

    @_('BEGIN')
    def begin_statement(self, p):
        return 'begin'

    @_('END')
    def end_statement(self, p):
        return 'end'

Parser debugging for LangParser written to parser.out


In [15]:
# Smoke test: parse a program consisting only of the `begin` keyword.
text = 'begin'
parse_ = LangParser()
begin_tokens = lex.tokenize(text)
parse_.parse(begin_tokens)

('begin',)


('begin',)

In [9]:
# Show the token stream for a list-comprehension-style snippet.
string = 'a = [i for i = 0; i < 10; ++i]'
# A token generator is also kept under `text`; it is not consumed in this cell.
text = lex.tokenize(string)

for token in lex.tokenize(string):
    print(token)

Token(type='ID', value='a', lineno=1, index=0)
Token(type='ASSIGN', value='=', lineno=1, index=2)
Token(type='[', value='[', lineno=1, index=4)
Token(type='ID', value='i', lineno=1, index=5)
Token(type='FOR', value='for', lineno=1, index=7)
Token(type='ID', value='i', lineno=1, index=11)
Token(type='ASSIGN', value='=', lineno=1, index=13)
Token(type='INT', value=0, lineno=1, index=15)
Token(type='SEP', value=';', lineno=1, index=16)
Token(type='ID', value='i', lineno=1, index=18)
Token(type='LESS', value='<', lineno=1, index=20)
Token(type='INT', value=10, lineno=1, index=22)
Token(type='SEP', value=';', lineno=1, index=24)
Token(type='INC', value='++', lineno=1, index=26)
Token(type='ID', value='i', lineno=1, index=28)
Token(type=']', value=']', lineno=1, index=29)


In [None]:
# NOTE (unfinished sketch): "ID ASSIGN [ ID FOR ID ASSIGN statement SEP ID LESS statement SEP INC ID ]" --> ""