In [13]:
from functools import wraps
from random import randrange

In [14]:
# Hand-written sample words; each already ends with the '$' terminator.
test_words = [
    "bbb$",
    "aabbabd$",
    "ababcd$",
    "abaababaabaabaabab$",
]

# A random 100-character word over {'a', 'b'} (code points 97 and 98),
# terminated with '$' like the other samples.
test_words.append("".join(chr(randrange(97, 99)) for _ in range(100)) + '$')

# The whole content of the sample text file, again terminated with '$'.
with open("1997_714_head.txt", "r", encoding='UTF-8') as f:
    test_words.append(f.read() + '$')

In [15]:
def build_kmp_table(pattern):
    """Compute the KMP prefix function of ``pattern``.

    Returns a list ``table`` of the same length as ``pattern`` where
    ``table[i]`` is the length of the longest proper prefix of
    ``pattern[:i + 1]`` that is also a suffix of it.
    """
    table = [0] * len(pattern)
    border = 0  # length of the border of the prefix processed so far
    for idx in range(1, len(pattern)):
        symbol = pattern[idx]
        # Fall back to shorter borders until one can be extended by `symbol`.
        while border > 0 and symbol != pattern[border]:
            border = table[border - 1]
        if symbol == pattern[border]:
            border += 1
        table[idx] = border
    return table

def kmp(pattern, text):
    """Find every occurrence of ``pattern`` in ``text`` with Knuth-Morris-Pratt.

    Returns a list with the index in ``text`` of the LAST character of each
    occurrence (overlapping occurrences included), in increasing order.
    """
    pattern_len = len(pattern)
    fallback = build_kmp_table(pattern)
    match_ends = []
    matched = 0  # number of pattern characters currently matched
    for pos, symbol in enumerate(text):
        # On a mismatch, shrink the matched prefix using the prefix function.
        while matched > 0 and pattern[matched] != symbol:
            matched = fallback[matched - 1]
        if pattern[matched] == symbol:
            matched += 1
        if matched == pattern_len:
            match_ends.append(pos)
            # Continue from the longest border so overlaps are reported too.
            matched = fallback[-1]
    return match_ends

In [16]:
from time import perf_counter

def timeit(func):
    """Decorator that measures the wall-clock duration of each call to ``func``.

    The decorated callable accepts the same arguments as ``func`` but returns
    a ``(result, seconds)`` tuple, where ``result`` is what ``func`` returned
    and ``seconds`` is the elapsed time rounded to 4 decimal places.
    """
    # `wraps` keeps the wrapped function's __name__ and __doc__, which the
    # plain closure would otherwise replace with those of `wrapper`.
    @wraps(func)
    def wrapper(*args, **kwargs):
        t1 = perf_counter()
        res = func(*args, **kwargs)
        t2 = perf_counter()
        return res, round(t2 - t1, 4)
    return wrapper

In [17]:
from abc import ABC, abstractmethod

class Tree(ABC):
    """Abstract base for the text indexes defined in this notebook.

    An instance stores the indexed ``text`` and a ``root`` attribute that
    starts as ``None``; concrete subclasses fill it in ``initiate_trie``.
    """

    def __init__(self, text):
        self.root = None
        self.text = text

    @abstractmethod
    def find_word(self, word):
        """Look ``word`` up in the index (must be provided by subclasses)."""

    @abstractmethod
    def initiate_trie(self, *args):
        """Build the index for ``self.text`` (must be provided by subclasses)."""

In [18]:
from math import inf

class TrieNode:
    """Node shared by the trie and suffix-tree builders in this notebook.

    Attributes:
        begin: start index of the node's label (defaults to 0).
        length: length of the node's label (defaults to ``inf``).
        edges: dict of children, keyed by a single character.
        suffix_link: optional reference to another node (``link`` argument).
    """

    # Class-level attribute; nothing in the visible code updates it.
    count = 0

    def __init__(self, begin=0, length=inf, link=None):
        self.begin, self.length = begin, length
        self.suffix_link = link
        self.edges = {}

def add_word(root, word):
    """Insert ``word`` into the trie rooted at ``root``, one node per character.

    Existing nodes along the path are reused; missing ones are created.
    """
    node = root
    for symbol in word:
        child = node.edges.get(symbol)
        if child is None:
            child = node.edges[symbol] = TrieNode()
        node = child

@timeit
def build_regular_trie(text):
    """Build an uncompressed trie containing every suffix of ``text``.

    Each suffix ``text[start:]`` is inserted with ``add_word``. Because of the
    ``timeit`` decorator, callers receive ``(root, elapsed_seconds)``.
    """
    trie_root = TrieNode()
    for start, _ in enumerate(text):
        add_word(trie_root, text[start:])
    return trie_root


def find_word_trie(root, word):
    """Return True if ``word`` can be spelled by walking down from ``root``.

    Follows one edge per character of ``word``; returns False as soon as a
    required edge is missing. The empty word is always found.
    """
    node = root
    for symbol in word:
        node = node.edges.get(symbol)
        if node is None:
            return False
    return True

class RegularTrie(Tree):
    """Index backed by the uncompressed trie from ``build_regular_trie``."""

    def initiate_trie(self):
        """Build the trie for ``self.text`` and return the measured build time."""
        built_root, elapsed = build_regular_trie(self.text)
        self.root = built_root
        return elapsed

    def find_word(self, word):
        """Return whether ``word`` can be spelled from the root of the trie."""
        return find_word_trie(self.root, word)

In [19]:
@timeit
def build_linked_trie(text):
    """Build an uncompressed suffix trie of ``text`` online, using suffix links.

    The text is consumed one character at a time. ``deepest`` is the node
    spelling the whole prefix read so far; following ``suffix_link`` from it
    visits the nodes of all shorter suffixes, ending at the root (whose link
    is ``None``). Every visited node gets a child for the new character, and
    the new children are chained together through their own suffix links.

    Because of the ``timeit`` decorator, callers receive
    ``(root, elapsed_seconds)``.
    """
    root = TrieNode()
    deepest = root
    for letter in text:
        cur = deepest
        # The node for the whole prefix never has this edge yet, so it is
        # always created. Its suffix link is fixed up by the loop below.
        prev = deepest = cur.edges[letter] = TrieNode(link=root)
        cur = cur.suffix_link
        while cur is not None:
            existing = cur.edges.get(letter)
            if existing is not None:
                # This suffix extended by `letter` is already in the trie, so
                # all of its shorter suffixes are too, with their links set.
                # Walking further down the chain would change nothing.
                prev.suffix_link = existing
                break
            new_node = TrieNode(link=root)
            cur.edges[letter] = new_node
            prev.suffix_link = new_node
            prev = new_node
            cur = cur.suffix_link
    return root

class LinkedTrie(Tree):
    """Index backed by the suffix-linked trie from ``build_linked_trie``."""

    def initiate_trie(self):
        """Build the trie for ``self.text`` and return the measured build time."""
        built_root, elapsed = build_linked_trie(self.text)
        self.root = built_root
        return elapsed

    def find_word(self, word):
        """Return whether ``word`` can be spelled from the root of the trie."""
        return find_word_trie(self.root, word)

In [20]:
def add_suffix(node, ind, text):
    """Insert the suffix ``text[ind:]`` into the compressed tree under ``node``.

    A child stored under ``edges[c]`` carries the label
    ``text[child.begin:child.begin + child.length]``; nodes created as leaves
    keep the default infinite length. The suffix is matched character by
    character. When it leaves the tree at a node, a new leaf is attached
    there; when it diverges in the middle of a label, that label is split
    first. If the whole suffix is already present, nothing changes.
    """
    matched = 0    # characters matched on the label of `child`
    child = None   # node whose label is currently being matched
    pos = ind
    text_len = len(text)
    while pos < text_len:
        symbol = text[pos]
        if matched == 0:
            # Standing exactly on `node`: pick the outgoing edge, or hang a
            # new leaf here if there is none for this character.
            child = node.edges.get(symbol)
            if child is None:
                node.edges[symbol] = TrieNode(pos)
                return
        if text[child.begin + matched] != symbol:
            break
        matched += 1
        if matched >= child.length:
            # The whole label was consumed; continue from the child node.
            node, matched = child, 0
        pos += 1
    else:
        # The suffix was fully matched without diverging.
        return

    # Divergence inside the label of `child`: split it after `matched` chars.
    split_begin = child.begin
    branch = TrieNode(split_begin, matched)
    node.edges[text[split_begin]] = branch
    branch.edges[text[split_begin + matched]] = child
    branch.edges[text[pos]] = TrieNode(pos)
    child.begin = split_begin + matched
    child.length -= matched


@timeit
def build_suffix_tree(text):
    """Build a compressed suffix tree by inserting every suffix in turn.

    Suffixes are added from the longest to the shortest with ``add_suffix``.
    Because of the ``timeit`` decorator, callers receive
    ``(root, elapsed_seconds)``.
    """
    tree_root = TrieNode()
    for start, _ in enumerate(text):
        add_suffix(tree_root, start, text)
    return tree_root


def find_word(node, text, word):
    """Return True if ``word`` can be read from ``node`` down the compressed tree.

    Args:
        node: node to start from (normally the root); children are found in
            ``node.edges`` keyed by the first character of their label.
        text: the indexed text; a child's label is
            ``text[child.begin:child.begin + child.length]``.
        word: the string to look for. The empty string is always found.

    Returns:
        True when every character of ``word`` is matched, False otherwise.
    """
    length = 0   # characters matched on the label of `edge`
    edge = None  # child whose label is currently being matched
    for letter in word:
        if length == 0:
            edge = node.edges.get(letter)
            if edge is None:
                return False
        pos = edge.begin + length
        # Edges with an infinite length run to the end of `text`, so `pos`
        # can step past the last character when `word` is longer than what
        # remains. That is a mismatch, not an IndexError.
        if pos >= len(text) or text[pos] != letter:
            return False
        length += 1
        if length >= edge.length:
            # Label fully consumed; continue from the child node.
            node = edge
            length = 0
    return True


class SuffixTrie(Tree):
    """Index backed by the compressed tree from ``build_suffix_tree``."""

    def initiate_trie(self):
        """Build the tree for ``self.text`` and return the measured build time."""
        built_root, elapsed = build_suffix_tree(self.text)
        self.root = built_root
        return elapsed

    def find_word(self, word):
        """Return whether ``word`` can be read from the root of the tree."""
        return find_word(self.root, self.text, word)


In [21]:
def test_correctness(text, TreeType):
    """Check ``TreeType`` against KMP on 100 random queries over ``text``.

    Builds the index, prints the text size and build time, then draws random
    non-empty substrings of ``text``. On odd-numbered trials an extra 'a' is
    inserted in the middle of the query, so queries that may not occur are
    covered as well. Each answer from ``find_word`` must agree with whether
    ``kmp`` reports at least one occurrence; a mismatch raises AssertionError.
    """
    tree = TreeType(text)
    build_time = tree.initiate_trie()
    print(f"Text size: {len(text)}, Build time: {build_time}")
    text_len = len(text)
    for trial in range(100):
        start = randrange(text_len - 1)
        stop = randrange(start + 1, text_len)
        query = text[start:stop]
        if trial % 2 == 1:
            middle = len(query) // 2
            query = query[:middle] + 'a' + query[middle:]
        expected = bool(kmp(query, text))
        assert tree.find_word(query) == expected, "Not OK"
    # Reached only when no assertion above failed.
    print("Tests passed\n")

In [23]:
# Validate the suffix-linked trie on every sample word.
for sample in test_words:
    test_correctness(sample, LinkedTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0002
Tests passed

Text size: 101, Build time: 0.0071
Tests passed

Text size: 2482, Build time: 12.5232
Tests passed



In [24]:
# Validate the plain suffix trie on every sample word.
for sample in test_words:
    test_correctness(sample, RegularTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0002
Tests passed

Text size: 101, Build time: 0.0057
Tests passed

Text size: 2482, Build time: 7.1274
Tests passed



In [26]:
# Validate the compressed suffix tree on every sample word.
for sample in test_words:
    test_correctness(sample, SuffixTrie)

Text size: 4, Build time: 0.0
Tests passed

Text size: 8, Build time: 0.0
Tests passed

Text size: 7, Build time: 0.0
Tests passed

Text size: 19, Build time: 0.0001
Tests passed

Text size: 101, Build time: 0.0006
Tests passed

Text size: 2482, Build time: 0.0243
Tests passed



In [None]:
def ukkonnen(text):
    """Unimplemented stub: ignores ``text`` and returns ``None``."""
    return None

In [None]:
# for t in test_words:
#     test_correctness(t, SuffixTree)