In [13]:
from random import randrange

In [102]:
test_words = [
    "bbb$",
    "aabbabd$",
    "ababcd$",
    "abaababaabaabaabab$",
    "".join([chr(randrange(97, 98)) for _ in range(1000)])]

with open("1997_714_head.txt", "r", encoding='UTF-8') as f:
    test_words.append(f.read())

test_words[-1] = test_words[-1] + '$'
test_words[-2] = test_words[-2] + '$'

In [92]:
def build_kmp_table(pattern):
    m = len(pattern)
    pi = [0 for _ in range(m)]
    k = 0
    for i in range(1, m):
        while k > 0 and pattern[i] != pattern[k]:
            k = pi[k-1]
        if pattern[i] == pattern[k]:
            k += 1
        pi[i] = k
    return pi

def kmp(pattern, text):
    m = len(pattern)
    pi = build_kmp_table(pattern)
    ans = []
    i = 0
    j = 0
    while j < len(text):
        while i > 0 and pattern[i] != text[j]:
            i = pi[i - 1]
        if pattern[i] == text[j]:
            i += 1
        if i == m:
            ans.append(j)
            i = pi[-1]
        j += 1
    return ans

In [93]:
from time import perf_counter

def timeit(func):
    def wrapper(*args, **kwargs):
        t1 = perf_counter()
        res = func(*args, **kwargs)
        t2 = perf_counter()
        return res, round(t2 - t1, 4)
    return wrapper

In [94]:
from abc import ABC, abstractmethod

class Tree(ABC):
    def __init__(self, text):
        self.text = text
        self.root = None

    @abstractmethod
    def find_word(self, word):
        pass

    @abstractmethod
    def initiate_trie(self, *args):
        pass

In [95]:
from math import inf

class TrieNode:
    count = 0
    def __init__(self, begin=0, length=inf, link=None):
        self.begin = begin
        self.length = length
        self.edges = {}
        self.suffix_link = link

def add_word(root, word):
    cur = root
    for letter in word:
        if letter not in cur.edges:
            cur.edges[letter] = TrieNode()
        cur = cur.edges[letter]

@timeit
def build_regular_trie(text):
    root = TrieNode()
    for i in range(len(text)):
        add_word(root, text[i:])
    return root


def find_word_trie(root, word):
    cur = root
    for x in word:
        if x not in cur.edges:
            return False
        cur = cur.edges[x]
    return True

class RegularTrie(Tree):
    def find_word(self, word):
        return find_word_trie(self.root, word)

    def initiate_trie(self):
        self.root, time = build_regular_trie(self.text)
        return time

In [96]:
@timeit
def build_linked_trie(text):
    root = TrieNode()
    deepest = root
    for letter in text:
        cur = deepest
        prev = deepest = cur.edges[letter] = TrieNode(link=root)
        cur = cur.suffix_link
        while cur is not None:
            if letter not in cur.edges:
                cur.edges[letter] = TrieNode(link=root)
            prev.suffix_link = cur.edges[letter]
            prev = cur.edges[letter]
            cur = cur.suffix_link
    return root

class LinkedTrie(Tree):
    def find_word(self, word):
        return find_word_trie(self.root, word)

    def initiate_trie(self):
        self.root, time = build_linked_trie(self.text)
        return time

In [97]:
def add_suffix(node, ind, text):
    length = 0
    edge = None
    for i in range(ind, len(text)):
        letter = text[i]
        if length == 0:
            if letter not in node.edges:
                node.edges[letter] = TrieNode(i)
                return
            edge = node.edges[letter]
        if text[edge.begin + length] != letter:
            break
        length += 1
        if length >= edge.length:
            node = edge
            length = 0
    else:
        return

    new_node = TrieNode(edge.begin, length)
    node.edges[text[edge.begin]] = new_node
    new_node.edges[text[edge.begin + length]] = edge
    new_node.edges[text[i]] = TrieNode(i)
    edge.begin = edge.begin + length
    edge.length -= length


@timeit
def build_suffix_tree(text):
    root = TrieNode()
    for i in range(len(text)):
        add_suffix(root, i, text)
    return root


def find_word(node, text, word):
    length = 0
    edge = None
    for letter in word:
        if length == 0:
            if letter not in node.edges:
                return False
            edge = node.edges[letter]
        if text[edge.begin + length] != letter:
            return False
        length += 1
        if length >= edge.length:
            node = edge
            length = 0
    return True


class SuffixTrie(Tree):
    def find_word(self, word):
        return find_word(self.root, self.text, word)

    def initiate_trie(self):
        self.root, time = build_suffix_tree(self.text)
        return time


In [87]:
def test_correctness(text, TreeType):
    tree = TreeType(text)
    build_time = tree.initiate_trie()
    print(f"Text size: {len(text)}, Build time: {build_time}")
    for i in range(100):
        left = randrange(len(text) - 1)
        right = randrange(left + 1, len(text))
        substring = text[left:right]
        if i % 2:
            m = len(substring) // 2
            substring = substring[:m] + 'a' + substring[m:]
        found = len(kmp(substring, text)) > 0
        assert tree.find_word(substring) == found, "Not OK"
    else:
        print("Tests passed\n")

In [69]:
for t in test_words:
    test_correctness(t, LinkedTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0001
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0001
Tests passed

Text size: 5001, Build time: 50.4947
Tests passed

Text size: 2482, Build time: 21.4934
Tests passed



In [59]:
for t in test_words:
    test_correctness(t, RegularTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0003
Tests passed

Text size: 5001, Build time: 2.2362
Tests passed

Text size: 2482, Build time: 7.2508
Tests passed



In [98]:
for t in test_words:
    test_correctness(t, SuffixTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0001
Tests passed

Text size: 5001, Build time: 3.1174
Tests passed

Text size: 2482, Build time: 0.0113
Tests passed



In [103]:
class ActivePoint:
    def __init__(self, root):
        self.tree_root = root
        self.length = 0
        self.edge = None
        self.node = root
        self.remainder = 0

def ukkonnen(text):
    root = TrieNode()
    pass

In [100]:
# class Node:
#         def __init__(self, begin, end, parent):
#             self.edges = {}
#             self.begin = begin
#             self.end = end
#             self.parent = parent
#
#         def __len__(self) -> int:
#             return self.end - self.begin
#
# @timeit
# def ukkonen(string: str) -> 'Node':
#     def split(node: 'Node', position: int) -> 'Node':
#         if position == len(node):
#             return node
#
#         new_node = Node(node.begin, node.begin + position, node.parent)
#         node.parent.edges[string[node.begin]] = new_node
#         new_node.edges[string[node.begin + position]] = node
#         node.parent = new_node
#         node.begin += position
#         return new_node
#
#     def suffix_link(node: 'Node') -> 'Node':
#         if node in suffix_links_table:
#             return suffix_links_table[node]
#
#         if node is root:
#             result = root
#         else:
#             link = suffix_link(node.parent)
#             result = split(*next_state(
#                 link, len(link),
#                 node.begin + (1 if node.parent is root else 0),
#                 node.end
#             ))
#
#         suffix_links_table[node] = result
#         return result
#
#     def next_state(node: 'Node', position: int, begin: int, end: int):
#
#         while begin < end:
#             edge_length = len(node)
#             if position == edge_length:
#                 if string[begin] in node.edges:
#                     node = node.edges[string[begin]]
#                     position = 0
#                 else:
#                     return None, None
#             else:
#                 if string[node.begin + position] != string[begin]:
#                     return None, None
#                 if end - begin < edge_length- position:
#                     return node, position + end - begin
#
#                 begin += edge_length - position
#                 position = edge_length
#
#         return node, position
#
#     root = Node(0, 0, None)
#     suffix_links_table = dict()
#     node = root
#     pos = 0
#     for index, character in enumerate(string):
#         while True:
#             node_, position_ = next_state(node, pos, index, index + 1)
#             if node_ is not None:
#                 node = node_
#                 pos = position_
#                 break
#
#             mid = split(node, pos)
#             leaf = Node(index, len(string), mid)
#             mid.edges[string[index]] = leaf
#
#             node = suffix_link(mid)
#             pos = len(node)
#
#             if mid is root:
#                 break
#     return root
#
# def find_word(suffix_tree, word, pattern):
#     position = 0
#     node = suffix_tree
#     for character in pattern:
#         if position < node.end:
#             if character != word[position]:
#                 return False
#             position += 1
#         else:
#             if character not in node.edges:
#                 return False
#             node = node.edges[character]
#             position = node.begin + 1
#     return True
#
#
# class SuffixTree(Tree):
#     def find_word(self, word):
#         return find_word(self.root, self.text, word)
#
#     def initiate_trie(self):
#         self.root, time = ukkonen(self.text)
#         return time

In [None]:
# for t in test_words:
#     test_correctness(t, SuffixTree)