Requirement 1

In [124]:
import re
import json

In [121]:
def is_valid_regex(regex):
    """
    Check whether a pattern is accepted by Python's `re` compiler.

    parameters :
    regex : the pattern to validate
    return :
    bool : True if `re.compile` accepts the pattern, False otherwise
    """
    try:
        re.compile(regex)
    # Only compilation failures mean "invalid"; a bare except would also
    # swallow KeyboardInterrupt and SystemExit.
    except (re.error, TypeError, OverflowError, RecursionError):
        return False
    return True

def add_concat(regex, operators):
    """
    Insert the explicit concatenation operator '.' into a regex.

    A '.' is inserted between two adjacent characters when the left one can
    end an operand (a literal, ')', ']' or a quantifier '*', '+', '?') and
    the right one can start an operand (a literal, '(' or '[').
    The contents of a character class '[...]' are copied through unchanged.

    parameters :
    regex : valid regex (a '[' is expected to have a matching ']')
    operators : valid operators in regex
    return :
    regex_with_concat : list of characters, the regex with '.' inserted
                        (empty list for an empty regex)
    """
    # An empty regex has no last character to append below.
    if not regex:
        return []

    openers = ('(', '[')
    closers = (')', ']')
    quantifiers = ('*', '+', '?')

    regex_with_concat = []
    last = len(regex) - 1
    i = 0
    while i < last:
        current, following = regex[i], regex[i + 1]
        regex_with_concat.append(current)

        ends_operand = (
            current not in operators
            or current in closers
            or current in quantifiers
        )
        starts_operand = following not in operators or following in openers
        if ends_operand and starts_operand:
            regex_with_concat.append('.')

        if current == '[':
            # Copy the class body verbatim; stop just before the closing ']'
            # so the next iteration (or the final append) handles it.
            while regex[i + 1] != ']':
                i += 1
                regex_with_concat.append(regex[i])
        i += 1

    # The loop never emits the final character itself.
    regex_with_concat.append(regex[last])
    return regex_with_concat

def comp_precedence(a, b):
    """
    Compare two operators by their position in the precedence table.

    parameters :
    a : first operator
    b : second operator
    return :
    bool : True if a is ranked strictly higher than b, False otherwise
    """
    # Listed from lowest to highest rank.
    ranking = ["|", ".", "-", "*", "+", "?"]
    rank_a = ranking.index(a)
    rank_b = ranking.index(b)
    return rank_b < rank_a

def creat_postfix_regex(regex, operators):
    """
    Apply the Shunting-Yard algorithm to create a postfix regex.

    Literals, the quantifiers '*', '+', '?' and the brackets '[' / ']' are
    written straight to the output; '(' / ')' group sub-expressions; every
    other operator goes through the operator stack using comp_precedence.
    A range '-' pushed inside a character class is emitted right after the
    closing ']' (e.g. "[a-d]*" gives "[ad]-*"), so it is not separated from
    its class by a following quantifier.

    parameters :
    regex : valid regex with concatenate operator (string or list of chars)
    operators : valid operators in regex
    return :
    postfix_regex : string holding the postfix regex
    """
    output = []
    operator_stack = []
    # Height of the operator stack when the current '[' was seen;
    # None while outside a character class.
    class_base = None

    for c in regex:
        if c == "[":
            output.append(c)
            class_base = len(operator_stack)
        elif c == "]":
            output.append(c)
            if class_base is not None:
                # Flush only the range operators pushed inside this class.
                while len(operator_stack) > class_base and operator_stack[-1] == "-":
                    output.append(operator_stack.pop())
                class_base = None
        elif c not in operators or c in ("*", "+", "?"):
            output.append(c)
        elif c == ")":
            while operator_stack and operator_stack[-1] != "(":
                output.append(operator_stack.pop())
            # Discard the matching "(".
            operator_stack.pop()
        elif (
            c == "("
            or not operator_stack
            or operator_stack[-1] == "("
            or comp_precedence(c, operator_stack[-1])
        ):
            operator_stack.append(c)
        else:
            # Pop operators that are not ranked lower than c, then push c.
            while (
                operator_stack
                and operator_stack[-1] != "("
                and not comp_precedence(c, operator_stack[-1])
            ):
                output.append(operator_stack.pop())
            operator_stack.append(c)

    while operator_stack:
        output.append(operator_stack.pop())

    return "".join(output)


In [128]:
class state:
    """
    One NFA state: an identifier, start/terminal flags and its outgoing
    transitions (a mapping from input symbol to the target state id(s)).
    """

    def __init__(self, id, is_start, is_treminal, next_states):
        self.id = id
        self.is_start = is_start
        self.is_treminal = is_treminal
        self.next_states = next_states

    def print_state(self):
        """Print the state id, every outgoing transition and its flags."""
        print("state ", self.id)
        for symbol, target in self.next_states.items():
            print("on ", symbol, "go to", target)
        flag_messages = (
            (self.is_start, "state is a start state"),
            (self.is_treminal, "state is a terminal state"),
        )
        for flag, message in flag_messages:
            if flag:
                print(message)

In [131]:
def creat_nfa(postfix_regex, operators):
    """
    Convert a postfix regex to an NFA using Thompson's construction.

    Every token creates a new (start, accept) pair of states whose ids are
    the token index followed by "0" (start) and "1" (accept). Operators pop
    the fragment(s) they apply to from a stack and wire them to the new pair
    through "epsilon" transitions. A character class is read as
    '[', members..., ']'; a range is recognised as the five-token sequence
    '[', low, high, ']', '-'.

    parameters :
    postfix_regex : postfix regex (output from Shunting-Yard algorithm)
    operators : valid operators in regex
    return :
    states : list holding all states that formulate the NFA, in creation order
    raises :
    ValueError : if an operator token has no construction rule here
    IndexError : if an operator is missing its operand(s) on the stack
    """
    def add_epsilon(source, target_id):
        # Append an epsilon edge, creating the edge list on first use.
        source.next_states.setdefault("epsilon", []).append(target_id)

    def demote(fragment):
        # A fragment wrapped by a new pair gives up its start/terminal roles.
        fragment[1].is_treminal = False
        fragment[0].is_start = False

    states = []
    states_stack = []
    i = 0
    while i < len(postfix_regex):
        token = postfix_regex[i]
        # Ids are fixed before `i` is advanced by the character-class branch.
        start_id = str(i) + "0"
        end_id = str(i) + "1"

        if token not in operators:
            # Literal: a single transition on the character itself.
            transitions = {token: end_id}
        elif token == '[':
            transitions = {}
            if len(postfix_regex) > i + 4 and postfix_regex[i + 4] == '-':
                # Range class: one transition per character in low..high.
                low, high = postfix_regex[i + 1], postfix_regex[i + 2]
                for code in range(ord(low), ord(high) + 1):
                    transitions[chr(code)] = end_id
                i += 4  # now on the trailing '-'
            else:
                # Plain class: one transition per listed member.
                while postfix_regex[i + 1] != ']':
                    i += 1
                    transitions[postfix_regex[i]] = end_id
                i += 1  # now on the closing ']'
        elif token == '|':
            second = states_stack.pop()
            first = states_stack.pop()
            transitions = {"epsilon": [first[0].id, second[0].id]}
            add_epsilon(first[1], end_id)
            add_epsilon(second[1], end_id)
            demote(first)
            demote(second)
        elif token == '.':
            second = states_stack.pop()
            first = states_stack.pop()
            transitions = {"epsilon": [first[0].id]}
            add_epsilon(first[1], second[0].id)
            add_epsilon(second[1], end_id)
            demote(first)
            demote(second)
        elif token == '*':
            inner = states_stack.pop()
            # The new start may skip the fragment entirely.
            transitions = {"epsilon": [inner[0].id, end_id]}
            add_epsilon(inner[1], inner[0].id)
            add_epsilon(inner[1], end_id)
            demote(inner)
        elif token == '+':
            inner = states_stack.pop()
            # No skip edge: the fragment must be traversed at least once.
            transitions = {"epsilon": [inner[0].id]}
            add_epsilon(inner[1], inner[0].id)
            add_epsilon(inner[1], end_id)
            demote(inner)
        elif token == '?':
            inner = states_stack.pop()
            # Skip edge but no loop back.
            transitions = {"epsilon": [inner[0].id, end_id]}
            add_epsilon(inner[1], end_id)
            demote(inner)
        else:
            # Previously such a token silently re-pushed the previous pair.
            raise ValueError(
                "unsupported operator %r at position %d in postfix regex"
                % (token, i)
            )

        s1 = state(start_id, True, False, transitions)
        s2 = state(end_id, False, True, {})
        states_stack.append((s1, s2))
        states.append(s1)
        states.append(s2)
        i += 1
    return states

def write_states_json(states):
    """
    Dump every NFA state to the file "NFA.json".

    The JSON object holds a "startingState" entry plus one "state<id>" entry
    per state, each with its "isTerminatingState" flag and one
    "inputCharacter<SYMBOL>" entry per outgoing transition (symbol upper-cased).

    parameters :
    states : array holds all states that formulate NFA
    """
    payload = {"startingState": "bla"}

    for node in states:
        if node.is_start:
            payload["startingState"] = node.id
        entry = {"isTerminatingState": node.is_treminal}
        entry.update(
            ("inputCharacter" + symbol.upper(), target)
            for symbol, target in node.next_states.items()
        )
        payload["state" + node.id] = entry

    with open("NFA.json", "w") as out_file:
        json.dump(payload, out_file)

def re_to_nfa(regex):
    """
    Run the whole pipeline: validate the regex, add explicit concatenation,
    convert to postfix, build the NFA and write it out as JSON.

    parameters :
    regex : input regex
    return :
    the string "error, invalid regex" when validation fails, otherwise None
    """
    if is_valid_regex(regex):
        ops = list("|+*?-()[].")
        postfix = creat_postfix_regex(add_concat(regex, ops), ops)
        write_states_json(creat_nfa(postfix, ops))
        return None
    return "error, invalid regex"
    

##### Try the algorithm: enter any regex to convert it to an NFA

In [None]:
# Example run: converts "(a|b)[a-d]" to an NFA; re_to_nfa writes the result to NFA.json.
re_to_nfa("(a|b)[a-d]")

In [None]:
def nfa_to_dfa():