In [190]:
import os
import re
from typing import NamedTuple

In [207]:
# Load the sample program source. The encoding is pinned so the read does not
# depend on the platform's default locale encoding.
with open("./sample.mb", "r", encoding="utf-8") as f:
    sample = f.read()
# Echo the program text so it can be inspected before it is tokenized.
print(sample)

PRINT "HELLO THERE!";
LET A = 10;
LET B = 5;
LET C = -3;
LET Z = (A*B) + C;
PRINT "ANSWER IS {Z}";



In [192]:
class Token(NamedTuple):
    """A single lexical token: a category tag paired with its source text."""
    # Category tag; the visible code emits "KEYWORD", "IDENTIFIER", "LITERAL",
    # "OPERATOR" and "SEPARATOR".
    type: str
    # Text of the token (for quoted strings, the content without the quotes).
    value: str

In [204]:
# Reserved words of the language. NOTE(review): not referenced by the visible
# tokenizer code, which compares against "PRINT"/"LET" directly.
keywords = "LET PRINT AND OR NOT IF ELSE GOTO".split()
# Single-character punctuation emitted as SEPARATOR tokens.
separator = list('[](){};,"')
# Arithmetic, assignment and comparison symbols emitted as OPERATOR tokens.
operator = "+ - * / % = != > < == <= >=".split()
# Output token stream, appended to by the parse_* functions.
tokens = []
# Names introduced by LET so far; identifiers are checked against this set.
variables = set()
# NOTE(review): unused in the visible code.
count = 0
# Statements are ';'-terminated, so whatever follows the final ';' (normally
# just trailing whitespace) is discarded.
lines = sample.split(';')
lines.pop()

def parse_string(string):
    """Record a quoted string as a single LITERAL token.

    The first and last characters of *string* (expected to be the surrounding
    double quotes) are dropped and the remainder is appended to the
    module-level ``tokens`` list.
    """
    unquoted = string[1:-1]
    literal = Token("LITERAL", unquoted)
    tokens.append(literal)

def _split_lexemes(text):
    """Split raw expression text into name, number and symbol lexemes."""
    lexemes = []
    pos, end = 0, len(text)
    while pos < end:
        char = text[pos]
        if char.isspace():
            pos += 1
            continue
        stop = pos + 1
        if char.isalpha():
            # Consume a whole (possibly multi-letter) name.
            while stop < end and text[stop].isalpha():
                stop += 1
        elif char.isnumeric():
            # Consume a whole integer ...
            while stop < end and text[stop].isnumeric():
                stop += 1
            # ... and a fractional part, but only when a digit follows the '.'.
            if stop + 1 < end and text[stop] == '.' and text[stop + 1].isnumeric():
                stop += 2
                while stop < end and text[stop].isnumeric():
                    stop += 1
        elif stop < end and text[pos:pos + 2] in operator:
            # Two-character operators such as '==' or '<=' win over '=' / '<'.
            stop = pos + 2
        lexemes.append(text[pos:stop])
        pos = stop
    return lexemes


def _is_numeric_lexeme(lexeme):
    """Return True for an integer or decimal literal with an optional '-'."""
    digits = lexeme[1:] if lexeme.startswith('-') else lexeme
    whole, dot, fraction = digits.partition('.')
    if not whole.isnumeric():
        return False
    return fraction.isnumeric() if dot else True


def parse_expression(expression):
    """Tokenize an expression and append its tokens to ``tokens``.

    *expression* is normally the raw expression text.  It is first split into
    lexemes so that multi-digit numbers, decimals, multi-letter names and
    two-character operators each become ONE token (iterating the string
    character by character would break them apart).  Whitespace between
    lexemes is ignored.  An already-split iterable of lexemes is also
    accepted and classified as given.

    Classification uses the module-level ``separator`` and ``operator`` tables
    and the ``variables`` set.  In raw text a '-' is always emitted as an
    OPERATOR; a signed literal such as '-3' is only recognised when passed as
    a pre-split lexeme.

    Raises:
        Exception: if a name is not in ``variables`` ("Variable ... not
            defined") or a lexeme fits no category ("Something wrong in the
            expression").
    """
    if isinstance(expression, str):
        lexemes = _split_lexemes(expression)
    else:
        lexemes = expression

    for e in lexemes:
        if e in separator:
            tokens.append(Token("SEPARATOR", e))
        elif e in operator:
            tokens.append(Token("OPERATOR", e))
        elif e.isalpha():
            if e not in variables:
                raise Exception(f"Variable {e} not defined")
            tokens.append(Token('IDENTIFIER', e))
        elif _is_numeric_lexeme(e):
            tokens.append(Token("LITERAL", e))
        else:
            # Nothing is dropped silently: anything unrecognised is an error.
            raise Exception("Something wrong in the expression")


def parse_print(string):
    """Tokenize the argument of a PRINT statement.

    *string* must be a double-quoted string.  Without braces it becomes one
    LITERAL token.  Otherwise the text is split around ``{...}`` placeholders:
    plain text becomes LITERAL tokens, each brace becomes a SEPARATOR token,
    and the text between a brace pair is handed to ``parse_expression``.
    Text following the last placeholder is kept as a final LITERAL.

    Raises:
        AssertionError: if the surrounding quotes are missing or the braces
            are not balanced.
    """
    # Raised explicitly rather than via ``assert`` so the check survives
    # ``python -O``; the exception type stays the same for existing callers.
    # The length test also covers an empty argument and a lone '"'.
    if len(string) < 2 or string[0] != '"' or string[-1] != '"':
        raise AssertionError("Error in PRINT. Missing \"")

    if '{' not in string and '}' not in string:
        parse_string(string)
        return

    # Strings with vars in a PRINT statement.
    text = string[1:-1]
    start = 0  # index where the current literal run / placeholder body begins
    in_placeholder = False
    for i, char in enumerate(text):
        if char == "{":
            if in_placeholder:
                raise AssertionError("Error in PRINT. Unexpected {")
            if i > start:
                tokens.append(Token("LITERAL", text[start:i]))
            tokens.append(Token("SEPARATOR", "{"))
            in_placeholder = True
            start = i + 1
        elif char == "}":
            if not in_placeholder:
                raise AssertionError("Error in PRINT. Unexpected }")
            parse_expression(text[start:i])
            tokens.append(Token("SEPARATOR", "}"))
            in_placeholder = False
            start = i + 1

    if in_placeholder:
        raise AssertionError("Error in PRINT. Missing }")
    # Keep whatever literal text follows the final placeholder.
    if start < len(text):
        tokens.append(Token("LITERAL", text[start:]))
        
def _is_float_literal(text):
    """Return True when *text* is a numeric literal that ``float`` accepts."""
    # float() also accepts words such as "inf" and "nan"; purely alphabetic
    # text must stay available as a variable name instead.
    if text.lstrip('+-').isalpha():
        return False
    try:
        float(text)
    except ValueError:
        return False
    return True


def parse_let(expression):
    """Tokenize the body of a LET statement, given as ``NAME=VALUE`` text.

    The caller is expected to have removed whitespace already.  The name is
    added to ``variables`` and emitted as an IDENTIFIER token; the '=' itself
    is not emitted.  A numeric right-hand side becomes one LITERAL token
    (including zero-valued decimals such as ``0.0``); anything else is passed
    to ``parse_expression``.

    Raises:
        AssertionError: if the name is not purely alphabetic, or the '=' or
            the value is missing.
    """
    # Split on the FIRST '=' only, so comparison operators in the value
    # ('==', '<=', '>=', '!=') stay intact.
    name, equals, value = expression.partition('=')

    # Raised explicitly rather than via ``assert`` so the checks survive
    # ``python -O``; the exception type stays the same for existing callers.
    if not name.isalpha():
        raise AssertionError("Invalid Variable name")
    if not equals or not value:
        raise AssertionError("Error in LET. Missing = or value")

    variables.add(name)
    tokens.append(Token('IDENTIFIER', name))

    if _is_float_literal(value):
        tokens.append(Token("LITERAL", value))
    else:
        parse_expression(value)

# Tokenize each ';'-terminated statement: emit its KEYWORD token, delegate the
# rest of the statement to the matching parse_* function, then emit the ';'.
for line in lines:
    # Newlines inside a statement carry no meaning; outer whitespace is
    # stripped so indented statements still start with their keyword.
    line = line.replace('\n', '').strip()
    # Split once on any whitespace run: keyword first, then the untouched
    # remainder (so spacing inside a PRINT string is preserved).
    chunks = line.split(None, 1)
    keyword = chunks[0] if chunks else ''
    argument = chunks[1] if len(chunks) > 1 else ''
    if keyword == "PRINT":
        tokens.append(Token("KEYWORD", keyword))
        parse_print(argument)
    elif keyword == "LET":
        tokens.append(Token("KEYWORD", keyword))
        # Whitespace is insignificant in an assignment, so drop all of it.
        parse_let(''.join(argument.split()))
    elif keyword:
        # Fail loudly instead of reducing an unsupported statement to ';'.
        raise Exception(f"Unknown statement: {line}")
    tokens.append(Token("SEPARATOR", ';'))

In [205]:
tokens

[Token(type='KEYWORD', value='PRINT'),
 Token(type='LITERAL', value='HELLO THERE!'),
 Token(type='SEPARATOR', value=';'),
 Token(type='KEYWORD', value='LET'),
 Token(type='IDENTIFIER', value='A'),
 Token(type='LITERAL', value='10'),
 Token(type='SEPARATOR', value=';'),
 Token(type='KEYWORD', value='LET'),
 Token(type='IDENTIFIER', value='B'),
 Token(type='LITERAL', value='5'),
 Token(type='SEPARATOR', value=';'),
 Token(type='KEYWORD', value='LET'),
 Token(type='IDENTIFIER', value='C'),
 Token(type='LITERAL', value='-3'),
 Token(type='SEPARATOR', value=';'),
 Token(type='KEYWORD', value='LET'),
 Token(type='IDENTIFIER', value='Z'),
 Token(type='SEPARATOR', value='('),
 Token(type='IDENTIFIER', value='A'),
 Token(type='OPERATOR', value='*'),
 Token(type='IDENTIFIER', value='B'),
 Token(type='SEPARATOR', value=')'),
 Token(type='OPERATOR', value='+'),
 Token(type='IDENTIFIER', value='C'),
 Token(type='SEPARATOR', value=';'),
 Token(type='KEYWORD', value='PRINT'),
 Token(type='LITERAL', v

To tokenize and build an Abstract Syntax Tree (AST) for the code snippet:

plaintext
Copy code
LET A = 10;
LET B = 5;
LET C = 2;
LET Z = (A*B)+C
PRINT Z
Step 1: Tokenization
Token Definitions
First, define the tokens for the keywords, identifiers, operators, numbers, and punctuation.

Keywords: LET, PRINT
Identifiers: A, B, C, Z
Operators: =, *, +
Numbers: 10, 5, 2
Punctuation: (, ), ;
Tokenization Process
Here's how you might tokenize the input:

plaintext
Copy code
[
    Token('KEYWORD', 'LET'),
    Token('IDENTIFIER', 'A'),
    Token('OPERATOR', '='),
    Token('NUMBER', '10'),
    Token('PUNCTUATION', ';'),

    Token('KEYWORD', 'LET'),
    Token('IDENTIFIER', 'B'),
    Token('OPERATOR', '='),
    Token('NUMBER', '5'),
    Token('PUNCTUATION', ';'),

    Token('KEYWORD', 'LET'),
    Token('IDENTIFIER', 'C'),
    Token('OPERATOR', '='),
    Token('NUMBER', '2'),
    Token('PUNCTUATION', ';'),

    Token('KEYWORD', 'LET'),
    Token('IDENTIFIER', 'Z'),
    Token('OPERATOR', '='),
    Token('PUNCTUATION', '('),
    Token('IDENTIFIER', 'A'),
    Token('OPERATOR', '*'),
    Token('IDENTIFIER', 'B'),
    Token('PUNCTUATION', ')'),
    Token('OPERATOR', '+'),
    Token('IDENTIFIER', 'C'),

    Token('KEYWORD', 'PRINT'),
    Token('IDENTIFIER', 'Z')
]
Step 2: Building the AST
AST Structure
Now, we’ll build the AST for each statement.

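LET Statements: Each assignment will be an ASSIGNMENT node with a VARIABLE and either a VALUE (when the right-hand side is a literal) or an EXPRESSION (when the right-hand side must be computed).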
Expression for Z: The expression Z = (A * B) + C will be represented as a tree that captures the order of operations.
PRINT Statement: The print statement will be represented with a PRINT node that has the variable Z.
AST Construction
Here’s how the AST might look for each part:

For LET A = 10:
plaintext
Copy code
ASTNode(ASSIGNMENT, [
    ASTNode(VARIABLE, ['A']),
    ASTNode(VALUE, ['10'])
])
For LET B = 5:
plaintext
Copy code
ASTNode(ASSIGNMENT, [
    ASTNode(VARIABLE, ['B']),
    ASTNode(VALUE, ['5'])
])
For LET C = 2:
plaintext
Copy code
ASTNode(ASSIGNMENT, [
    ASTNode(VARIABLE, ['C']),
    ASTNode(VALUE, ['2'])
])
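For LET Z = (A * B) + C: The expression involves parentheses, multiplication, and addition. The parentheses do not become nodes of their own; the grouping they express is captured by nesting the A * B sub-expression inside the outer + expression. The AST will look like this: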
plaintext
Copy code
ASTNode(ASSIGNMENT, [
    ASTNode(VARIABLE, ['Z']),
    ASTNode(EXPRESSION, [
        ASTNode(OPERATOR, ['+']),
        ASTNode(EXPRESSION, [
            ASTNode(VARIABLE, ['A']),
            ASTNode(OPERATOR, ['*']),
            ASTNode(VARIABLE, ['B'])
        ]),
        ASTNode(VARIABLE, ['C'])
    ])
])
For PRINT Z:
plaintext
Copy code
ASTNode(PRINT, [
    ASTNode(VARIABLE, ['Z'])
])
Complete AST Representation
Putting it all together, the complete AST might look something like this:

plaintext
Copy code
ASTNode(PROGRAM, [
    ASTNode(ASSIGNMENT, [
        ASTNode(VARIABLE, ['A']),
        ASTNode(VALUE, ['10'])
    ]),
    ASTNode(ASSIGNMENT, [
        ASTNode(VARIABLE, ['B']),
        ASTNode(VALUE, ['5'])
    ]),
    ASTNode(ASSIGNMENT, [
        ASTNode(VARIABLE, ['C']),
        ASTNode(VALUE, ['2'])
    ]),
    ASTNode(ASSIGNMENT, [
        ASTNode(VARIABLE, ['Z']),
        ASTNode(EXPRESSION, [
            ASTNode(OPERATOR, ['+']),
            ASTNode(EXPRESSION, [
                ASTNode(VARIABLE, ['A']),
                ASTNode(OPERATOR, ['*']),
                ASTNode(VARIABLE, ['B'])
            ]),
            ASTNode(VARIABLE, ['C'])
        ])
    ]),
    ASTNode(PRINT, [
        ASTNode(VARIABLE, ['Z'])
    ])
])
Conclusion
This structure captures the relationships and hierarchy of the program’s components, allowing for easier semantic analysis, optimization, and code generation in subsequent phases of processing. You can expand upon this design as needed for more complex features or additional constructs in your language.