-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from nyjc-computing/scanner
01 Scanning
- Loading branch information
Showing
1 changed file
with
91 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# Helper functions | ||
|
||
def atEnd(code):
    """Report whether the source text has been fully consumed.

    ``code`` is a dict whose ``'src'`` entry holds the text that has not
    been scanned yet. Returns True when that text has length zero.
    """
    remaining = code['src']
    return not len(remaining)
|
||
def check(code):
    """Peek at the next unscanned character without removing it.

    Returns the first element of ``code['src']`` and leaves ``code``
    untouched. Indexing an empty source raises IndexError.
    """
    remaining = code['src']
    return remaining[0]
|
||
def consume(code):
    """Remove the next character from the source and return it.

    The character is read with check(); ``code['src']`` is then rebound
    to everything after that first character, so ``code`` is mutated in
    place.
    """
    head = check(code)
    # Advance past the character that was just read.
    code['src'] = code['src'][1:]
    return head
|
||
# Scanning functions | ||
|
||
def word(code):
    """Scan a run of alphabetic characters and return it as one token.

    The first character is consumed unconditionally: scan() only
    dispatches here after seeing that the next character is alphabetic.
    Scanning stops at the end of the source or at the first character
    for which ``str.isalpha()`` is false; that character is left in
    ``code['src']``.
    """
    letters = [consume(code)]
    while True:
        if atEnd(code) or not check(code).isalpha():
            break
        # Move the letter from the source onto the token.
        letters.append(consume(code))
    return ''.join(letters)
|
||
def integer(code):
    """Scan a run of digit characters and return it as one token.

    The first character is consumed unconditionally; the rest of the
    token is every following character for which ``str.isdigit()`` is
    true. The token is returned as a string, not converted to ``int``.
    """
    digits = [consume(code)]
    while True:
        if atEnd(code) or not check(code).isdigit():
            break
        # Move the digit from the source onto the token.
        digits.append(consume(code))
    return ''.join(digits)
|
||
def string(code):
    """Scan a double-quoted string literal and return it, quotes included.

    The first character (the opening quote when called from scan()) is
    consumed unconditionally. Characters are then collected up to the
    next double-quote, which is also consumed and appended. If the
    source runs out before a closing quote is found, the characters
    collected so far are returned without one.
    """
    chars = [consume(code)]
    # Collect everything up to, but not including, the next '"'.
    while not (atEnd(code) or check(code) == '"'):
        chars.append(consume(code))
    if atEnd(code):
        # Unterminated literal: nothing left to consume.
        return ''.join(chars)
    # Take the closing double-quote as part of the token.
    chars.append(consume(code))
    return ''.join(chars)
|
||
def symbol(code):
    """Scan a symbol token and return it.

    The first character is consumed unconditionally. Brackets and
    parentheses (``( ) [ ]``) are always single-character tokens. Any
    other leading symbol greedily absorbs the following characters drawn
    from ``:.+-/*=<>`` so that multi-character symbols come out as one
    token.
    """
    pieces = [consume(code)]
    if pieces[0] not in '()[]':
        # Extend with every following character that can be part of a
        # multi-character symbol.
        while not atEnd(code) and check(code) in ':.+-/*=<>':
            pieces.append(consume(code))
    return ''.join(pieces)
|
||
|
||
|
||
# Main scanning loop | ||
|
||
def scan(src):
    """Split source text into a list of token strings.

    Spaces, carriage returns and tabs are discarded. A newline becomes a
    token of its own. Words, integers, double-quoted strings and symbols
    are delegated to word(), integer(), string() and symbol(), chosen by
    looking at the next character. A progress line is printed for every
    token produced.

    Raises:
        ValueError: if the next character does not start any known token.
    """
    code = {'src': src}
    tokens = []
    while not atEnd(code):
        upcoming = check(code)

        # Insignificant whitespace: drop it and look at the next character.
        if upcoming in (' ', '\r', '\t'):
            consume(code)
            continue

        # Pick the scanner responsible for this kind of token.
        if upcoming == '\n':
            scanner = consume  # a line break is a one-character token
        elif upcoming.isalpha():
            scanner = word
        elif upcoming.isdigit():
            scanner = integer
        elif upcoming == '"':
            scanner = string
        elif upcoming in '()[]:.+-/*=<>':
            scanner = symbol
        else:
            # repr() gives an unambiguous rendering of the offending
            # character (including invisible ones) for the error report.
            raise ValueError(f"Unrecognised character {repr(upcoming)}.")

        lexeme = scanner(code)
        tokens.append(lexeme)
        print('Scanned token:', lexeme, ', characters left:', len(code['src']))
    return tokens