-
Notifications
You must be signed in to change notification settings - Fork 2
/
indent.py
executable file
·173 lines (134 loc) · 3.9 KB
/
indent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/python
"""
An indentation based array parser written in PLY (Python Lex and Yacc)
Author: Matt Donahoe
2012-12-09
I was curious how lex would handle the "off-side rule".
I had to create a two stage lexer:
The first is the regular lexer in lex.py
The second has a buffer of tokens and a stack of indentation levels.
The indentation logic is based off of Python's
http://docs.python.org/2/reference/lexical_analysis.html#indentation
OTHER RESOURCES
Lexing Python
http://erezsh.wordpress.com/2008/07/12/python-parsing-1-lexing/
Off-side Rule
http://en.wikipedia.org/wiki/Off-side_rule
See GardenSnake.py in PLY's examples for a more complete version I found.
"""
from ply import lex, yacc
from sys import stdin, argv
import copy
# FIRST LEXING STAGE
tokens = (
    'STRING',
    'WHITESPACE',
    # INDENT and DEDENT have no t_ rule: the second stage (IndentLexer)
    # produces them by retyping WHITESPACE tokens. They still have to be
    # declared here because the grammar uses them as terminals.
    'INDENT',
    'DEDENT',
)


# Empty (or space-only) lines don't affect indentation.
# This is a function rule so that it is tried before the string rules below.
# The lookahead leaves the final newline in place so it can start the
# WHITESPACE token of the next non-blank line.
def t_blank_line(t):
    r'\n[ ]*(?=\n)'
    # Returning nothing discards the match.


t_STRING = r'[^ \n][^\n]*'
t_WHITESPACE = r'\n[ ]*'

# NOTE: t_ignore is a string of literal characters to skip, not a regex.
# Only spaces are skipped; since WHITESPACE already consumes the indentation
# after every newline, this only matters for spaces at the very start of input.
t_ignore = ' '
def t_error(t):
    """Report a token the first-stage lexer could not match.

    Called by the lexer with the offending token; only prints it.
    """
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('error! %s' % t)
# Create our first stage: the plain PLY lexer returned by lex.lex().
# It emits the raw STRING / WHITESPACE stream that the second stage consumes.
lexer = lex.lex()
# SECOND STAGE
class IndentLexer(object):
    """
    A second lexing stage that interprets WHITESPACE
    Manages Off-Side Rule for indentation

    Wraps a first-stage lexer (any object with ``input()`` and ``token()``)
    and rewrites its token stream:

    * WHITESPACE that deepens the indentation becomes one INDENT
    * WHITESPACE that reduces it becomes one DEDENT per level closed
    * WHITESPACE at the current level is dropped
    * every other token passes through unchanged
    * at end of input one DEDENT is emitted for every level still open, so
      input without a trailing newline still yields balanced INDENT/DEDENT
    """

    def __init__(self, lexer):
        self.indents = [0]  # indentation stack (widths of the open levels)
        self.tokens = []    # token queue (DEDENTs waiting to be returned)
        self.lexer = lexer
        self._last = None   # last first-stage token; template for EOF DEDENTs

    def input(self, *args, **kwds):
        """Feed new input to the first stage and forget any previous state."""
        # A new input starts at indentation 0 with nothing buffered.
        self.indents = [0]
        self.tokens = []
        self._last = None
        self.lexer.input(*args, **kwds)

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        """Return the next token; raise StopIteration at end of input."""
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next  # Python 3 spelling of the iterator protocol

    def token(self):
        """Return the next second-stage token, or None at end of input."""
        # empty our buffer first
        if self.tokens:
            return self.tokens.pop(0)

        # loop until we find a valid token
        while True:
            # grab the next from first stage
            token = self.lexer.token()
            if not token:
                # end of input: close whatever is still open
                return self._close_open_indents()
            self._last = token

            # we only care about whitespace
            if token.type != 'WHITESPACE':
                return token

            # check for new indent/dedent
            whitespace = token.value[1:]  # strip \n
            change = self._calc_indent(whitespace)
            if change:
                break

        # indenting exactly one level
        if change > 0:
            token.type = 'INDENT'
            return token

        # dedenting one or more times: return the first DEDENT now and
        # buffer one copy for each additional level closed
        token.type = 'DEDENT'
        for _ in range(-change - 1):
            self.tokens.append(copy.copy(token))
        return token

    def _close_open_indents(self):
        """Emit DEDENTs for levels still open at end of input.

        Returns the first DEDENT (buffering the rest), or None when the
        indentation stack is already back at its base level.
        """
        open_levels = len(self.indents) - 1
        if open_levels <= 0 or self._last is None:
            return None
        del self.indents[1:]
        # There is no WHITESPACE token to retype at EOF, so clone the last
        # token we saw instead of mutating one that was already returned.
        dedent = copy.copy(self._last)
        dedent.type = 'DEDENT'
        dedent.value = ''
        for _ in range(open_levels - 1):
            self.tokens.append(copy.copy(dedent))
        return dedent

    def _calc_indent(self, whitespace):
        """returns a number representing indents added or removed

        ``whitespace`` is the run of spaces that starts a line. The result
        is 1 when a new level was opened, 0 when the level is unchanged, and
        -k when k levels were closed. The indentation stack is updated.

        Raises SyntaxError when a dedent lands between two open levels.
        """
        n = len(whitespace)     # number of spaces
        indents = self.indents  # stack of space numbers

        if n > indents[-1]:
            indents.append(n)
            return 1

        # we are at the same level
        if n == indents[-1]:
            return 0

        # dedent one or more times
        i = 0
        while n < indents[-1]:
            indents.pop()
            if n > indents[-1]:
                # n matches none of the enclosing levels
                raise SyntaxError("wrong indentation level")
            i -= 1
        return i
# Create the second stage. This rebinds the module-level name `lexer`: from
# here on it is the IndentLexer wrapping the first-stage lexer built above.
lexer = IndentLexer(lexer)
# PARSING STAGE
def p_array(p):
    """array : INDENT elements DEDENT"""
    # The array's value is the element list between INDENT and DEDENT.
    inner = p[2]
    p[0] = inner
def p_elements(p):
    "elements : elements element"
    # Build a new list: the elements collected so far plus the new one.
    collected = list(p[1])
    collected.append(p[2])
    p[0] = collected
def p_elements_one(p):
    "elements : element"
    # Base case: a single element starts a one-item list.
    first = p[1]
    p[0] = [first]
def p_element(p):
    """element : STRING
               | array"""
    # Either alternative passes its value through unchanged.
    value = p[1]
    p[0] = value
def p_error(p):
    # No docstring on purpose: in this file the docstrings of p_* functions
    # hold grammar rules.
    # Reports a parse error by printing the value the parser passed in.
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('p error %s' % p)
if __name__ == '__main__':
    # Usage: reads the whole of stdin.
    #   indent.py lex   -> print the second-stage token stream, one per line
    #   indent.py       -> lex, parse, and print the parse result
    from sys import argv, stdin

    x = stdin.read()
    if len(argv) > 1 and argv[1] == 'lex':
        # lex only
        lexer.input(x)
        for t in lexer:
            if t.type == 'STRING':
                print('STRING:' + t.value)
            else:
                print(t.type)
    else:
        # lex and parse
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print(yacc.yacc().parse(x, lexer))