# Lexical Analyzer
1. The first phase of a compiler.
2. Reads the input characters of the source program.
3. Groups them into lexemes.
4. Produces as output a sequence of tokens, one for each lexeme.

### working with a block of code

In [30]:
import re
import sys
class Lexer():
    """Stateless, regex-driven tokenizer for C99-style source text.

    All rules in ``_c99_desc`` are joined into one alternation of named
    groups. At every position the first rule (in table order) that matches
    wins, so longer operators and comments are listed before their shorter
    prefixes.

    The tokenizer keeps no state between calls. When it is fed one line at a
    time, a block comment that spans several lines is reported as a
    ``MULTILINE_COMMENT_START`` token on its first line and a
    ``MULTILINE_COMMENT_END`` token on its last line; the lines in between
    are tokenized like ordinary code. When it is fed a whole file, the same
    comment is returned as a single ``COMMENT`` token.
    """

    # Ordered (pattern, token_name) table. Patterns are raw strings and must
    # not contain capturing groups: `tokenize` relies on `Match.lastgroup`
    # being the name of the rule that matched.
    _c99_desc = [
        # Comments
        (r"/\*[\s\S]*?\*/",       "COMMENT"),
        (r"//.*$",                "CPP_COMMENT"),
        (r"/\*.*?$",              "MULTILINE_COMMENT_START"),
        # Only fires at the start of a line, and refuses to run across a
        # "/*" so that code followed by a self-contained comment on the same
        # line is not swallowed into this token.
        (r"^(?:(?!/\*).)*?\*/",   "MULTILINE_COMMENT_END"),
        # 3-character operators
        (r"<<=",                  "OP_BITWISE_LSHIFT_ASSIGN"),
        (r">>=",                  "OP_BITWISE_RSHIFT_ASSIGN"),
        # 2-character operators
        (r"==",                   "OP_EQUALS"),
        (r"!=",                   "OP_NOT_EQUALS"),
        (r"\+=",                  "OP_ADD_ASSIGN"),
        (r"-=",                   "OP_SUB_ASSIGN"),
        (r"\*=",                  "OP_MUL_ASSIGN"),
        (r"/=",                   "OP_DIV_ASSIGN"),
        (r"%=",                   "OP_MOD_ASSIGN"),
        (r"&=",                   "OP_AND_ASSIGN"),
        (r"\|=",                  "OP_OR_ASSIGN"),
        (r"\^=",                  "OP_BITWISE_XOR_ASSIGN"),
        (r">=",                   "OP_GT_EQUAL"),
        (r"<=",                   "OP_LT_EQUAL"),
        (r"->",                   "OP_STRUCT_DEREF"),
        (r"<<",                   "OP_BITWISE_LSHIFT"),
        (r">>",                   "OP_BITWISE_RSHIFT"),
        (r"&&",                   "OP_LOGICAL_AND"),
        (r"\|\|",                 "OP_LOGICAL_OR"),
        (r"\+\+",                 "OP_INC"),
        (r"--",                   "OP_DEC"),
        # single-character operators and symbols
        (r"\{",                   "SYM_LCB"),
        (r"\}",                   "SYM_RCB"),
        (r"\(",                   "SYM_LP"),
        (r"\)",                   "SYM_RP"),
        (r"\[",                   "SYM_LB"),
        (r"\]",                   "SYM_RB"),
        (r"&",                    "SYM_AMPERSAND"),
        (r"\|",                   "OP_BITWISE_OR"),
        (r"!",                    "OP_LOGICAL_NOT"),
        (r"~",                    "OP_BITWISE_NOT"),
        (r"\^",                   "OP_BITWISE_XOR"),
        (r"\.",                   "OP_STRUCT_REF"),
        (r"\?",                   "OP_TERNARY"),
        (r":",                    "SYM_COLON"),
        (r";",                    "SYM_SEMICOLON"),
        (r",",                    "SYM_COMMA"),
        (r"=",                    "OP_ASSIGN"),
        (r">",                    "OP_GT"),
        (r"<",                    "OP_LT"),
        (r"\+",                   "SYM_PLUS"),
        (r"-",                    "SYM_MINUS"),
        (r"\*",                   "SYM_STAR"),
        (r"/",                    "OP_DIV"),
        (r"%",                    "OP_MOD"),
        # C99 keywords. The trailing \b stops a keyword from matching the
        # beginning of a longer word ("do" in "double", "int" in "integer").
        (r"auto\b",               "KW_AUTO"),
        (r"break\b",              "KW_BREAK"),
        (r"case\b",               "KW_CASE"),
        (r"char\b",               "KW_CHAR"),
        (r"const\b",              "KW_CONST"),
        (r"continue\b",           "KW_CONTINUE"),
        (r"default\b",            "KW_DEFAULT"),
        (r"do\b",                 "KW_DO"),
        (r"double\b",             "KW_DOUBLE"),
        (r"else\b",               "KW_ELSE"),
        (r"enum\b",               "KW_ENUM"),
        (r"extern\b",             "KW_EXTERN"),
        (r"float\b",              "KW_FLOAT"),
        (r"for\b",                "KW_FOR"),
        (r"goto\b",               "KW_GOTO"),
        (r"if\b",                 "KW_IF"),
        (r"inline\b",             "KW_INLINE"),
        (r"int\b",                "KW_INT"),
        (r"long\b",               "KW_LONG"),
        (r"register\b",           "KW_REGISTER"),
        (r"restrict\b",           "KW_RESTRICT"),
        (r"return\b",             "KW_RETURN"),
        (r"short\b",              "KW_SHORT"),
        (r"signed\b",             "KW_SIGNED"),
        (r"sizeof\b",             "KW_SIZEOF"),
        (r"static\b",             "KW_STATIC"),
        (r"struct\b",             "KW_STRUCT"),
        (r"switch\b",             "KW_SWITCH"),
        (r"typedef\b",            "KW_TYPEDEF"),
        (r"union\b",              "KW_UNION"),
        (r"unsigned\b",           "KW_UNSIGNED"),
        (r"void\b",               "KW_VOID"),
        (r"volatile\b",           "KW_VOLATILE"),
        (r"while\b",              "KW_WHILE"),
        (r"_Bool\b",              "KW_BOOL"),
        (r"_Complex\b",           "KW_COMPLEX"),
        (r"_Imaginary\b",         "KW_IMAGINARY"),
        # Labels
        (r"[a-zA-Z_]\w*:",        "LABEL"),
        # Macros. \b (rather than a mandatory whitespace character) accepts
        # "#include<stdio.h>" and a bare "#endif", and cannot consume the
        # newline and run on into the following line.
        (r"#define\b.*?$",        "M_DEFINE"),
        (r"#undef\b.*?$",         "M_UNDEF"),
        (r"#include\b.*?$",       "M_INCLUDE"),
        (r"#ifdef\b.*?$",         "M_IFDEF"),
        (r"#ifndef\b.*?$",        "M_IFNDEF"),
        (r"#endif\b.*?$",         "M_ENDIF"),
        (r"#if\b.*?$",            "M_IF"),
        (r"#elif\b.*?$",          "M_ELIF"),
        # Constants
        (r"\d+",                  "NUMERIC_CONSTANT"),
        # String / character literals: the closing quote must be the same
        # kind as the opening one, and backslash escapes are skipped over.
        (r'"(?:\\.|[^"\\\n])*"'
         r"|'(?:\\.|[^'\\\n])*'", "STRING_CONSTANT"),
        # Identifiers
        (r"[a-zA-Z0-9_]+",        "IDENTIFIER"),
        # Whitespace characters (space, tab, newline)
        (r"\s+",                  "WHITESPACE"),
    ]

    def __init__(self, keep_whitespaces=False):
        """Compile the rule table into a single tokenizer regex.

        Args:
            keep_whitespaces: when true, ``WHITESPACE`` tokens are included
                in the output of `tokenize`; otherwise they are dropped.
        """
        # Decides whether whitespace tokens are emitted or skipped.
        self._keep_ws = keep_whitespaces

        # Wrap every rule in a named group and join them into one pattern so
        # a single match() call tries all rules in table order.
        re_list = [
            "(?P<%s>%s)" % (name, pattern)
            for pattern, name in self._c99_desc
        ]
        # MULTILINE makes ^ and $ work per line, so multi-line input is
        # handled the same way as line-by-line input.
        self.re_tokenizer = re.compile("|".join(re_list), re.MULTILINE)

    def tokenize(self, line):
        """Split ``line`` into a list of ``(token_name, lexeme)`` tuples.

        Args:
            line: source text to tokenize; a single line or several lines.

        Returns:
            The tokens in source order. ``WHITESPACE`` tokens are omitted
            unless the lexer was created with ``keep_whitespaces=True``.
            A character that no rule matches is returned as an
            ``("UNKNOWN", char)`` token and scanning continues after it.
        """
        position = 0
        end = len(line)
        tokens = []
        while position < end:
            # Try every rule at the current position.
            r = self.re_tokenizer.match(line, position)
            if r is None or r.end() == position:
                # Report the offending character and move on instead of
                # silently dropping the rest of the input. The zero-width
                # check guarantees forward progress.
                tokens.append(("UNKNOWN", line[position]))
                position += 1
                continue
            if r.lastgroup != "WHITESPACE" or self._keep_ws:
                tokens.append((r.lastgroup, r.group(r.lastgroup)))
            position = r.end()

        return tokens

In [31]:
# Read the whole sample program into one string; the context manager closes
# the file handle even if reading fails.
with open("Code.txt", "r") as file:
    line = file.read()

c = Lexer()
# Last expression of the cell, so the token list is shown as the cell output.
c.tokenize(line)

[('KW_INT', 'int'),
 ('IDENTIFIER', 'main'),
 ('SYM_LP', '('),
 ('SYM_RP', ')'),
 ('SYM_LCB', '{'),
 ('KW_INT', 'int'),
 ('IDENTIFIER', 'number'),
 ('SYM_SEMICOLON', ';'),
 ('IDENTIFIER', 'cout'),
 ('OP_BITWISE_LSHIFT', '<<'),
 ('STRING_CONSTANT', '"Enter an integer: "'),
 ('SYM_SEMICOLON', ';'),
 ('IDENTIFIER', 'cin'),
 ('OP_BITWISE_RSHIFT', '>>'),
 ('IDENTIFIER', 'number'),
 ('SYM_SEMICOLON', ';'),
 ('IDENTIFIER', 'cout'),
 ('OP_BITWISE_LSHIFT', '<<'),
 ('STRING_CONSTANT', '"You entered "'),
 ('OP_BITWISE_LSHIFT', '<<'),
 ('IDENTIFIER', 'number'),
 ('SYM_SEMICOLON', ';'),
 ('KW_RETURN', 'return'),
 ('NUMERIC_CONSTANT', '0'),
 ('SYM_SEMICOLON', ';'),
 ('SYM_RCB', '}'),
 ('COMMENT', '/* hello */'),
 ('CPP_COMMENT', '//nora')]

NLTK tokenizers divide strings into lists of substrings. For example, a tokenizer can be used to find the words and punctuation in a string:

### working with a statement of code

In [20]:
# imports various regex packages and libraries to be used
import re

import nltk

# Ask the user for a statement and split it into word / punctuation tokens.
input_program = input("Enter Your Code: ")
input_program_tokens = nltk.wordpunct_tokenize(input_program)

print(input_program_tokens)

# Regular expressions describing each token category (raw strings, so the
# backslashes reach the regex engine unchanged).

RE_Keywords = "auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto|if|int|long|register|return|short|signed|sizeof|static|struct|switch|typedef|union|unsigned|void|volatile|while|string|class|struc|include"
RE_Operators = r"(\++)|(-)|(=)|(\*)|(/)|(%)|(--)|(<=)|(>=)"
RE_Numerals = r"^(\d+)$"
RE_Special_Characters = r"[\[@&~!#$\^\|{}\]:;<>?,\.']|\(\)|\(|\)|{}|\[\]|\""
RE_Identifiers = r"^[a-zA-Z_]+[a-zA-Z0-9_]*"


# Categorize the tokens. The first category that matches wins.

for token in input_program_tokens:
    # A keyword must be the whole token; a substring search would report
    # identifiers such as "printf" or "format" as keywords.
    if re.fullmatch(RE_Keywords, token):
        print(token, "-------> Keyword")
    # Punctuation tokens may be runs of several symbols (e.g. "++)"), so the
    # remaining categories keep using a search inside the token.
    elif re.findall(RE_Operators, token):
        print(token, "-------> Operator")
    elif re.findall(RE_Numerals, token):
        print(token, "-------> Numeral")
    elif re.findall(RE_Special_Characters, token):
        print(token, "-------> Special Character/Symbol")
    elif re.findall(RE_Identifiers, token):
        print(token, "-------> Identifiers")
    else:
        # Show which token could not be classified.
        print(token, "-------> Unknown Value")


Enter Your Code: for ( int i=0;i<5;i+1)
['for', '(', 'int', 'i', '=', '0', ';', 'i', '<', '5', ';', 'i', '+', '1', ')']
for -------> Keyword
( -------> Special Character/Symbol
int -------> Keyword
i -------> Identifiers
= -------> Operator
0 -------> Numeral
; -------> Special Character/Symbol
i -------> Identifiers
< -------> Special Character/Symbol
5 -------> Numeral
; -------> Special Character/Symbol
i -------> Identifiers
+ -------> Operator
1 -------> Numeral
) -------> Special Character/Symbol
