# Two Dimensional Pattern Matching



In [1]:
from queue import Queue
from PIL import Image
from time import perf_counter
import numpy as np

# First we need to find a way of building the automaton
### To do it we will use a Node class

In [2]:
class Node:

    def __init__(self, data=0, parent=None, terminal=False):
        # each node will store the root of the trie it belongs to
        # We will use the root to kep track of the number of nodes in the tree
        # we will create multiple tries so we can't simply use a static variable
        if parent == None:
            self.root = self
        else:
            self.root = parent.root
        # data field will store the node's letter if the node isn't the root
        # otherwise it will store the number nodes in the tree
        self.data = data
        # it might be hard to believe but this field stores the reference to node's parent node
        self.parent = parent
        # boolean value - is the node a leaf
        self.terminal = terminal
        # children dict
        self.children = {}
        # transitions will be automata's vertexes
        self.transition = {}
        self.id = self.root.data
        self.root.data += 1

### Then we'll build the trie structure that we will base our automaton on

In [3]:
# Sligthly modified code from the lecture
def create_trie(patterns):
    root = Node()
    for pattern in patterns:
        node = root
        for i in range(len(pattern)):
            if pattern[i] in node.children:
                node = node.children[pattern[i]]
            else:
                new_node = Node(pattern[i], node, False)
                node.children[pattern[i]] = new_node
                node = new_node
            if i == len(pattern) - 1:
                node.terminal = True
    return root

### To create an automaton and to process the text we'll need to extract some information about it. Specifically the alphabet and length of the longest line

In [4]:
def get_text_info(text):
    alphabet = set()
    max_len = 0
    for line in text:
        max_len = max(
            len(line), max_len
        )  # we'll need this for data with different line lengths
        for letter in line:
            alphabet.add(letter)
    return alphabet, max_len

### After that we'll only need to create the automaton out of the trie structure

In [5]:
# Modified code from the lecture
def multi_sma(root, alphabet):

    result = []
    if root.terminal:
        result.append(root)

    t = [{} for i in range(root.data + 1)]

    queue = Queue()
    for child in root.children.values():
        queue.put(child)
    for l in alphabet:
        root.transition[l] = root
        t[root.id][l] = root.id

    while not queue.empty():

        node = queue.get()
        parent = node.parent
        letter = node.data

        for child in node.children.values():
            queue.put(child)

        prev_node = parent.transition[letter]
        parent.transition[letter] = node
        t[parent.id][letter] = node.id

        if node.terminal or prev_node.terminal:
            result.append(node.id)

        for l in alphabet:
            if l in prev_node.children:
                node.transition[l] = prev_node.children[l]
                t[node.id][l] = prev_node.children[l].id
            else:
                node.transition[l] = prev_node.transition[l]
                t[node.id][l] = t[prev_node.id][l]

    return t, result

### We will also need a function that will translate the text onto array of automata states

In [6]:
def create_state_array(arr, alphabet, transition, max_len=None):
    if max_len is None:
        max_len = len(arr[0])

    state_array = [[None] * max_len for i in range(len(arr))]
    for j in range(max_len):
        state = 0
        for i in range(len(arr)):
            if len(arr[i]) > j:
                state = transition[state][arr[i][j]]
            else:
                state = 0
            state_array[i][j] = state

    return state_array

### And a function that will translate our set of patterns to sequence of terminal states 
(We'll try to find this sequence in the array of states generatet by the previous function)

In [7]:
def get_pattern_states(patterns, transition):
    result = []
    for pattern in patterns:
        state = 0
        for letter in pattern:
            state = transition[state][letter]
        result.append(state)
    return result

# With these tools we can proceed to solving given tasks

### Task 1
To solve task 1 we'll simply put our previous functions to use

In [8]:
def find_in_matrix(pattern, text):

    t0 = perf_counter()

    # automata creation
    alphabet, max_len = get_text_info(text)
    root = create_trie(pattern)
    transition, _ = multi_sma(root, alphabet)

    # translating the text to array of states
    state_array = create_state_array(text, alphabet, transition)

    # translating the pattern to one dimensional list that we'll search for in the state array
    pattern_states = get_pattern_states(pattern, transition)

    t1 = perf_counter()

    # creating an automata for pattern_states
    pattern_root = create_trie([pattern_states])
    pattern_transition, _ = multi_sma(pattern_root, set(pattern_states))

    # searching for the pattern
    result = []
    for i in range(len(state_array)):
        state = 0
        for j in range(len(state_array[0])):
            state = pattern_transition[state].setdefault(state_array[i][j], 0)
            if state == len(pattern_states) - 1:
                result.append((i, j))

    t2 = perf_counter()
    return result, t1 - t0, t2 - t1

### Task 2
Firstly we will extract the information about our text

In [9]:
file = open("haystack.txt", "r")
text = file.readlines()
file.close()
alphabet, max_len = get_text_info(text)

We can't directly use the code from task 1 as we need to find several different two dimensional patterns. However we can still use previous functions to create a multipple pattern matching automata

In [10]:
# Our algorithms search the code in top down order so 2 identical characters (#) in the same column will make "##" pattern
patterns = []
for letter in alphabet:
    patterns.append(letter + letter)  # python string concatenation

root = create_trie(patterns)
transition, pattern_match = multi_sma(root, alphabet)
state_array = create_state_array(text, alphabet, transition, max_len)

let's notice that when creating a trie structure we initialize nodes follwing DFS graph traverse. So since the root's id is 0 and each leaf is in the distance of 2 from the root, we can be certain that leave nodes' id-s will always be even numbers (but different that 0) and internal node's will have odd id-s

In [11]:
result = []
for i in range(len(state_array)):
    for j in range(len(state_array[i])):
        if (
            state_array[i][j] % 2 == 0 and state_array[i][j] != 0
        ):  # pattern is found if the state is an even and different than 0
            result.append((i, j))
print(result[:10], len(result))

[(1, 7), (1, 10), (1, 27), (1, 58), (1, 63), (1, 73), (1, 82), (1, 83), (2, 2), (2, 3)] 394


We can use simpler algorithm that will read the text linearly, with each character being read twice which in this tasks is a better and simpler solution since the SMA based one needs to read the text to dermine the alphabet which forces it to read the text twice anyway

In [12]:
result = []
for i in range(len(text) - 1):
    for j in range(len(text[i])):
        if len(text[i + 1]) > j and text[i][j] == text[i + 1][j]:
            result.append((i + 1, j))
print(result[:10], len(result))

[(1, 7), (1, 10), (1, 27), (1, 58), (1, 63), (1, 73), (1, 82), (1, 83), (2, 2), (2, 3)] 394


### Task 2
In task 2 we will need 3 different patterns. "tt", "  " and "hh"

In [13]:
patterns = ["tt", "  ", "hh"]

root = create_trie(patterns)
transition, pattern_match = multi_sma(root, alphabet)

pattern_states = get_pattern_states(patterns, transition)
print(pattern_states)  # equal to [2,4,6]
# so finding "t h" is equivalent to finding [2,4,6] pattern in the state array and finding "th" to finding [2,6]

state_array = create_state_array(text, alphabet, transition, max_len)

[2, 4, 6]


now let's find our patterns. We cen create a simple automaton for finding 2 patterns

In [14]:
patterns_b = [
    [pattern_states[0], pattern_states[1], pattern_states[2]],
    [pattern_states[0], pattern_states[2]],
]

# creating multiple pattern searching automaton
root_b = create_trie(patterns_b)
transition_b, pattern_match_b = multi_sma(root_b, set([2, 4, 6]))

state = 0
result = []
for i in range(len(state_array)):
    for j in range(max_len):
        state = transition_b[state].setdefault(state_array[i][j], 0)
        if state == pattern_match_b[0] or state == pattern_match_b[1]:
            result.append((i, j))

print(result)

[(38, 2)]


# Tasks 4-7
To process images we'll need to translate them to an arrays. Please note that the image that we'll be searching for must be transposed as each pattern that is vertical in the image must be horizontal in the python list

In [15]:
def image_to_matrix(image, transposing=False):
    image_map = image.load()
    result = []  # resulting array

    # width and hight. Swapped if need be
    h, w = image.height, image.width
    if transposing:
        h, w = w, h

    for i in range(h):
        row = []
        for j in range(w):
            if transposing:
                row.append(image_map[i, j][0])
            else:
                row.append(image_map[j, i][0])
        result.append(row)
    return result

In [16]:
# loading images
p_haystack = Image.open("haystack.png")
haystack = image_to_matrix(p_haystack)
p_haystack.close

p_pattern = Image.open("pattern.png")
pattern = image_to_matrix(p_pattern, transposing=True)
p_pattern.close()

p_t, p_h, p_a, p_n = (
    Image.open("t.png"),
    Image.open("h.png"),
    Image.open("h.png"),
    Image.open("n.png"),
)
t = image_to_matrix(p_t, transposing=True)
h = image_to_matrix(p_h, transposing=True)
a = image_to_matrix(p_a, transposing=True)
n = image_to_matrix(p_n, transposing=True)
p_t.close()
p_h.close()
p_a.close()
p_n.close()

p_tc1, p_tc2, p_tc3 = (
    Image.open("tc1.png"),
    Image.open("tc2.png"),
    Image.open("tc3.png"),
)
tc1 = image_to_matrix(p_tc1, transposing=True)
tc2 = image_to_matrix(p_tc2, transposing=True)
tc3 = image_to_matrix(p_tc3, transposing=True)
p_tc1.close()
p_tc2.close()
p_tc3.close()

p_haystack_t = Image.open("haystack.png")
haystack_t = image_to_matrix(p_haystack_t, transposing=True)
p_haystack_t.close()

### Task 4
We can finally use the function from task 1 to simply search for pattern array in image array

In [17]:
t_results = find_in_matrix(t, haystack)
h_results = find_in_matrix(h, haystack)
a_results = find_in_matrix(a, haystack)
n_results = find_in_matrix(n, haystack)

# printing number of results and first 10 results
print(len(t_results[0]), t_results[0][: min(10, len(t_results[0]))])
print(len(h_results[0]), h_results[0][: min(10, len(h_results[0]))])
print(len(a_results[0]), a_results[0][: min(10, len(a_results[0]))])
print(len(n_results[0]), n_results[0][: min(10, len(n_results[0]))])

388 [(47, 96), (47, 285), (47, 356), (47, 503), (47, 621), (47, 639), (69, 81), (69, 123), (69, 152), (69, 222)]
179 [(47, 106), (69, 388), (69, 546), (69, 633), (91, 396), (91, 524), (113, 195), (113, 673), (113, 699), (135, 230)]
179 [(47, 106), (69, 388), (69, 546), (69, 633), (91, 396), (91, 524), (113, 195), (113, 673), (113, 699), (135, 230)]
331 [(47, 48), (47, 226), (47, 254), (47, 442), (47, 529), (47, 613), (47, 665), (47, 770), (69, 109), (69, 719)]


### Task 5
We can do exactly the same to find larger patterns

In [18]:
print(find_in_matrix(pattern, haystack)[0])

[(491, 282), (513, 526), (557, 340), (601, 240), (645, 346)]


### Task 6
Our function returns not only the lower left corner coordintes for the searched pattern but also the time of automata creation phase and the time of the pattern searching phase

In [19]:
time_compare_1 = find_in_matrix(tc1, haystack)
time_compare_2 = find_in_matrix(tc2, haystack)
time_compare_3 = find_in_matrix(tc3, haystack)

In [20]:
print(
    f"times for {len(tc1[0])} x {len(tc1)} pixels :\n {round(time_compare_1[1],5)}s and {round(time_compare_1[2],5)}s\n"
)
print(
    f"times for {len(tc2[0])} x {len(tc2)} pixels :\n {round(time_compare_2[1],5)}s and {round(time_compare_2[2],5)}s\n"
)
print(
    f"times for {len(tc3[0])} x {len(tc3)} pixels :\n {round(time_compare_3[1],5)}s and {round(time_compare_3[2],5)}s\n"
)

times for 201 x 265 pixels :
 5.0608s and 0.41692s

times for 85 x 116 pixels :
 1.87262s and 0.33925s

times for 58 x 80 pixels :
 0.9549s and 0.35173s


### Task 7
To slice the image we can use numpy library's intuitive list comprehensions

In [21]:
half = np.array(haystack_t)[:, : len(haystack_t[0]) // 2]
quarter = np.array(haystack_t)[:, : len(haystack_t[0]) // 4]
eighth = np.array(haystack_t)[:, : len(haystack_t[0]) // 8]

In [22]:
half_time = find_in_matrix(half, haystack)
quarter_time = find_in_matrix(quarter, haystack)
eighth_time = find_in_matrix(eighth, haystack)

In [23]:
print(
    f"search time for {len(half[0])} x {len(half)} pixels :\n {round(half_time[2],5)}s\n"
)
print(
    f"search time for {len(quarter[0])} x {len(quarter)} pixels :\n {round(quarter_time[2],5)}s\n"
)
print(
    f"search time for {len(eighth[0])} x {len(eighth)} pixels :\n {round(eighth_time[2],5)}s\n"
)

search time for 950 x 860 pixels :
 1.16823s

search time for 475 x 860 pixels :
 2.58366s

search time for 237 x 860 pixels :
 1.18434s


In [24]:
print(
    f"automata building time for {len(half[0])} x {len(half)} pixels :\n {round(half_time[1],5)}s\n"
)
print(
    f"automata building for {len(quarter[0])} x {len(quarter)} pixels :\n {round(quarter_time[1],5)}s\n"
)
print(
    f"automata building for {len(eighth[0])} x {len(eighth)} pixels :\n {round(eighth_time[1],5)}s\n"
)

automata building time for 950 x 860 pixels :
 123.67523s

automata building for 475 x 860 pixels :
 75.209s

automata building for 237 x 860 pixels :
 21.62053s
