In [None]:
from queue import Queue
from typing import Dict, List, Set, TypeVar
from PIL import Image
X = TypeVar('X')
from time import perf_counter

1. Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego

In [None]:
def image_to_matrix(img: 'Image.Image'):
    """Convert an image into a row-major matrix with one value per pixel.

    The image is read through ``img.load()`` and indexed as ``[col, row]``.
    For multi-band pixels (returned as tuples) only the first band is kept;
    single-band pixels (returned as bare numbers) are used as they are, so
    grayscale or palette images no longer fail on the ``[0]`` subscript.

    :param img: object exposing ``load()``, ``width`` and ``height``.
    :return: list of ``img.height`` rows, each a list of ``img.width`` values.
    """
    pixel_map = img.load()
    matrix = []
    for row in range(img.height):
        line = []
        for col in range(img.width):
            value = pixel_map[col, row]
            # Tuples come from multi-band images; keep the first band only.
            line.append(value[0] if isinstance(value, tuple) else value)
        matrix.append(line)
    return matrix


class Node:
    """One state of the trie/automaton built from the pattern's columns."""

    def __init__(self) -> None:
        super().__init__()
        # Numeric id of the state; the root keeps 0.
        self.state: int = 0
        # Failure link; it stays None for the root only.
        self.fail: 'Node' = None
        # Outgoing edges: symbol -> target node.
        self.transitions: Dict[X, 'Node'] = {}


class Automaton:
    """Exact two-dimensional pattern matcher (Baker-Bird style).

    The pattern's columns are stored in a trie with failure links. Running
    that automaton down every column of the text replaces each cell by a
    state id; a pattern column ends at a cell exactly when the cell holds that
    column's final state. Each row of state ids is then scanned with a
    KMP-style transition table for the sequence of final states of the
    pattern's columns.
    """

    def __init__(self, pattern: List[List[X]]) -> None:
        """Build both automata for ``pattern`` (a non-empty rectangular matrix)."""
        super().__init__()
        # Final state of every pattern column, left to right.
        self.final_states: List[int] = []
        # KMP-style table: final state -> next table state, indexed by current one.
        self.final_states_automaton: Dict[int, List[int]] = {}
        self.trie: Node = Node()
        self.current_state: Node = self.trie
        self.pattern = pattern

        alphabet = self._build_trie(pattern)
        self._build_failure_links(alphabet)

        self.compute_final_states(pattern)
        self.compute_final_states_automaton()

    def _build_trie(self, pattern: List[List[X]]) -> Set[X]:
        """Insert every pattern column into the trie; return the symbols used."""
        self.trie.state = 0
        next_state = 1
        alphabet: Set[X] = set()
        for col in range(len(pattern[0])):
            node = self.trie
            for row in pattern:
                symbol = row[col]
                child = node.transitions.get(symbol)
                if child is None:
                    alphabet.add(symbol)
                    child = Node()
                    child.state = next_state
                    next_state += 1
                    node.transitions[symbol] = child
                node = child
        return alphabet

    def _build_failure_links(self, alphabet: Set[X]) -> None:
        """Fill in ``fail`` for every non-root node, level by level."""
        root = self.trie
        pending: Queue = Queue()
        for symbol in alphabet:
            child = root.transitions.get(symbol)
            if child is None:
                # Pattern symbols that start no column loop on the root, which
                # guarantees the fallback walk below always terminates there.
                root.transitions[symbol] = root
            else:
                child.fail = root
                pending.put(child)

        while not pending.empty():
            node = pending.get()
            # Only the node's own edges matter; scanning the whole alphabet
            # for every node is unnecessary.
            for symbol, child in node.transitions.items():
                pending.put(child)
                fallback = node.fail
                while symbol not in fallback.transitions:
                    fallback = fallback.fail
                child.fail = fallback.transitions[symbol]

    def read_char(self, letter: X):
        """Advance the column automaton by ``letter`` and return the new state id.

        A symbol that leads nowhere even from the root resets the automaton
        to the root and yields the root's state id.
        """
        node = self.current_state
        while letter not in node.transitions:
            node = node.fail
            if node is None:
                self.current_state = self.trie
                return self.trie.state
        self.current_state = node.transitions[letter]
        return self.current_state.state

    def rollback(self):
        """Reset the column automaton to the root."""
        self.current_state = self.trie

    def compute_final_states(self, pattern: List[List[X]]):
        """Append to ``final_states`` the state reached after each pattern column."""
        for col in range(len(pattern[0])):
            self.rollback()
            last_state = 0
            for row in pattern:
                last_state = self.read_char(row[col])
            self.final_states.append(last_state)
        self.rollback()

    def compute_final_states_automaton(self):
        """Build the KMP-style transition table over ``final_states``.

        ``final_states_automaton[s][k]`` is the table state entered when the
        state id ``s`` is read in table state ``k``; table state
        ``len(final_states)`` means a full match. Column ``len(final_states)``
        is filled as well so that scanning continues correctly right after a
        match (overlapping or adjacent occurrences).
        """
        sequence = self.final_states
        width = len(sequence)
        table = self.final_states_automaton
        table.clear()
        for state in sequence:
            table[state] = [0] * (width + 1)

        table[sequence[0]][0] = 1
        # Table state the matcher would be in after reading sequence[1:k].
        restart = 0
        # Every column 1..width must be filled, regardless of how many
        # distinct final states exist, and `restart` must not advance for
        # column 0 -- otherwise no fallback transitions are ever produced.
        for k in range(1, width + 1):
            for row in table.values():
                row[k] = row[restart]
            if k < width:
                table[sequence[k]][k] = k + 1
                restart = table[sequence[k]][restart]

    def parse_line(self, line: List[int]):
        """Return the indices in ``line`` at which a full match of ``final_states`` ends."""
        table = self.final_states_automaton
        target = len(self.final_states)
        ends: List[int] = []
        state = 0
        for index, value in enumerate(line):
            row = table.get(value)
            if row is None:
                # Not the final state of any pattern column: restart.
                state = 0
                continue
            state = row[state]
            if state == target:
                ends.append(index)
        return ends

    def find(self, text: List[List[X]]):
        """Find every occurrence of the pattern in ``text``.

        ``text`` is a list of rows that may differ in length. A row that is
        too short to reach the current column interrupts that column: the
        column automaton is reset there, so no occurrence is reported across
        the gap.

        :return: list of ``(row, col)`` positions of the top-left corner of
            each occurrence, ordered by row and then by column.
        """
        height = len(self.pattern)
        width = len(self.pattern[0])
        longest = max((len(row) for row in text), default=0)
        automaton_output: List[List[int]] = [[] for _ in text]

        for col in range(longest):
            self.rollback()
            for row_index, row in enumerate(text):
                if col < len(row):
                    automaton_output[row_index].append(self.read_char(row[col]))
                else:
                    self.rollback()
        self.rollback()

        matches = []
        for row_index, states in enumerate(automaton_output):
            for col_end in self.parse_line(states):
                matches.append((row_index - height + 1, col_end - width + 1))
        return matches

2. Znajdź w załączonym pliku "haystack.txt" wszystkie sytuacje, gdy taka sama litera występuje na tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę na nierówną długość linii w pliku.

In [None]:
with open('haystack.txt', 'r') as file:
    haystack = [list(line) for line in file]

# Every distinct character of the text is searched for separately. The line
# terminator is left out: it is not part of the text's content and would
# otherwise be reported whenever two neighbouring lines have equal length.
alphabet = {letter for row in haystack for letter in row}
alphabet.discard('\n')

result = []
# Sorted iteration keeps the printed order reproducible between runs.
for letter in sorted(alphabet):
    # 2x1 pattern: the same character directly above itself.
    result.extend(Automaton([[letter], [letter]]).find(haystack))
print(result)

3. Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji.

In [None]:
with open('haystack.txt', 'r') as file:
    # One list of characters per line of the file.
    haystack = [list(line) for line in file.readlines()]
    # Each search uses the same row twice, stacked vertically; the label is
    # printed twice to picture the two-row pattern.
    searches = (
        ("th", ['t', 'h']),
        ("t h", ['t', ' ', 'h']),
    )
    for label, row in searches:
        print(label)
        print(label)
        print(Automaton([row, list(row)]).find(haystack))

4. Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tych liter w załączonym pliku "haystack.png".

In [None]:
with Image.open('haystack.png') as img, \
        Image.open('patterns/s.png') as s_img, \
        Image.open('patterns/i.png') as i_img, \
        Image.open('patterns/m.png') as m_img, \
        Image.open('patterns/p.png') as p_img:
    haystack = image_to_matrix(img)
    # Convert all letter images up front, in the same order as they were opened.
    s, i, m, p = (
        image_to_matrix(picture)
        for picture in (s_img, i_img, m_img, p_img)
    )
    # Report the occurrences of each letter under its own heading.
    for heading, letter_matrix in (("s:", s), ("i:", i), ("m:", m), ("p:", p)):
        print(heading)
        print(Automaton(letter_matrix).find(haystack))

5. Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png.

In [None]:
with Image.open('haystack.png') as img, \
        Image.open('patterns/pattern.png') as pattern_img:
    # Both images become pixel matrices before any searching starts.
    haystack, pattern = image_to_matrix(img), image_to_matrix(pattern_img)
    matcher = Automaton(pattern)
    occurrences = matcher.find(haystack)
    print(occurrences)

6. Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca

In [None]:
def time_test(text, pattern):
    """Print how long building an Automaton for `pattern` and searching `text` take.

    Each duration is the difference of two perf_counter() readings (seconds)
    and is printed on its own line, labelled "automaton" and "search".
    Nothing is returned; the search result itself is discarded.
    """
    # Phase 1: only the constructor call sits between the two timer reads.
    start = perf_counter()
    automaton = Automaton(pattern)
    end = perf_counter()
    print(f"automaton: {end-start}")
    # Phase 2: only the search sits between the two timer reads.
    start = perf_counter()
    automaton.find(text)
    end = perf_counter()
    print(f"search: {end-start}")

with Image.open('haystack.png') as img, \
        Image.open('patterns/small.png') as small_img, \
        Image.open('patterns/medium.png') as medium_img, \
        Image.open('patterns/large.png') as large_img:
    haystack = image_to_matrix(img)
    # All pattern images are converted before the first measurement starts.
    small, medium, large = (
        image_to_matrix(picture)
        for picture in (small_img, medium_img, large_img)
    )
    # Run the same timing routine for each pattern size under its heading.
    for heading, sample in (("small:", small), ("medium:", medium), ("large:", large)):
        print(heading)
        time_test(haystack, sample)

7. Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania

In [None]:
def chunker_list(seq, size):
    """Split ``seq`` into ``size`` contiguous, non-overlapping fragments.

    Fragments are yielded in order and together cover ``seq`` exactly once.
    Their lengths differ by at most one, with the longer fragments first.
    Consecutive elements stay together (unlike a strided ``seq[i::size]``
    split, which interleaves them), so each fragment of a row matrix is an
    intact horizontal band. A non-positive ``size`` yields nothing.

    :param seq: sliceable sequence supporting ``len()``.
    :param size: number of fragments to produce.
    :return: generator of slices of ``seq``.
    """
    if size <= 0:
        return
    base, extra = divmod(len(seq), size)
    begin = 0
    for index in range(size):
        # The first `extra` fragments absorb the remainder, one element each.
        stop = begin + base + (1 if index < extra else 0)
        yield seq[begin:stop]
        begin = stop

with Image.open('haystack.png') as img, \
        Image.open('patterns/pattern.png') as pattern_img:
    haystack = image_to_matrix(img)
    pattern = image_to_matrix(pattern_img)
    # A single automaton is built once and reused for every measurement.
    automaton = Automaton(pattern)

    def search_fragments(count):
        # Search each fragment produced for `count`; results are discarded.
        for text in chunker_list(haystack, count):
            automaton.find(text)

    start = perf_counter()
    search_fragments(2)
    end = perf_counter()
    print(f"two parts: {end-start}")

    start = perf_counter()
    search_fragments(4)
    end = perf_counter()
    print(f"four parts: {end-start}")

    start = perf_counter()
    search_fragments(8)
    end = perf_counter()
    print(f"eight parts: {end-start}")