In [None]:
from os import path
from typing import Union

In [None]:
from bs4 import BeautifulSoup

In [117]:
class BibleReader:
    def __init__(self, file_name:Union[str, None]=None):
        if file_name is None:
            file_name = path.join('data', 'bible-csp.xml')
        data = None
        with open(file_name, encoding='utf-8') as file:
            data = file.read()
        bs_data = BeautifulSoup(data, 'xml')
        self.sentences = []
        buffer = ''
        for verse in bs_data.find_all('v'):
            verse = verse.find_all(text=True, recursive=False)
            verse = str(verse)[2:-2]
            if '.' not in verse:
                buffer += verse
            else:
                sentences = (buffer + ' ' + verse).split('.')
                buffer = ''
                sentence_number = len(sentences)
                added_sentences_count = 0
                for i in (range(sentence_number) if '.' in sentences[sentence_number - 1] else range(sentence_number - 1)):
                    sentence = sentences[i].lower()
                    words = [''.join([character for character in word if character.isalpha()]) for word in sentence.split(' ') if word != '']
                    words = [word for word in words if word != '']
                    self.sentences.append(words)
                    added_sentences_count += 1
                if added_sentences_count != sentence_number:
                    buffer += sentences[sentence_number - 1]
            

In [118]:
br = BibleReader()

In [123]:
class BibleBidirectorialDictionary:
    def __init__(self, data: BibleReader):
        self.ordered = []
        for sentence in data.sentences:
            for word in sentence:
                i = -1
                found = False
                beginning, end = 0, len(self.ordered) - 1
                while end - beginning > 0:
                    if (end - beginning) % 2 == 0:
                        current_visited_index = beginning + (end - beginning) // 2
                    else:
                        current_visited_index = beginning + (end - beginning) // 2 + 1
                    current_visited = self.ordered[current_visited_index]
                    if word == current_visited:
                        i = current_visited_index
                        found = True
                        break
                    elif word < current_visited:
                        end -= (end - current_visited_index) + 1
                        i = current_visited_index
                    else:
                        beginning += (current_visited_index - beginning) + 1
                        i = current_visited_index + 1
                else:
                    if len(self.ordered) == 0:
                        i = 0
                    elif len(self.ordered) == 1:
                        i = 0 if word < self.ordered[0] else 1
                    found = False
                if not found:
                    self.ordered.insert(i, word)

    def index_of_word(self, word:str) -> int:
        beginning, end = 0, len(self.ordered) - 1
        while end - beginning > 0:
            if (end - beginning) % 2 == 0:
                current_visited_index = beginning + (end - beginning) // 2
            else:
                current_visited_index = beginning + (end - beginning) // 2 + 1
            current_visited = self.ordered[current_visited_index]
            if word == current_visited:
                return beginning + (end - beginning) // 2
            elif word < current_visited:
                end -= (end - current_visited_index) + 1
            else:
                beginning += (current_visited_index - beginning) + 1
        return -1
            
        

In [124]:
bbd = BibleBidirectorialDictionary(br)

In [81]:
import json

In [125]:
with open(path.join('data', 'words.json'), 'w') as file:
    file.write(json.dumps(bbd.ordered))

In [109]:
bbd.index_of_word('a')

2

In [116]:
br.sentences

[['na', 'počátku', 'stvořil', 'bůh', 'nebesa', 'a', 'zemi'],
 ['země',
  'byla',
  'pustá',
  'a',
  'prázdná',
  'temnota',
  'byla',
  'nad',
  'hlubinou',
  'a',
  'duch',
  'boží',
  'se',
  'vznášel',
  'nad',
  'vodami'],
 ['i', 'řekl', 'bůh', 'budiž', 'světlo', 'a', 'bylo', 'světlo'],
 ['bůh',
  'viděl',
  'že',
  'světlo',
  'je',
  'dobré',
  'a',
  'oddělil',
  'bůh',
  'světlo',
  'od',
  'tmy'],
 ['bůh',
  'nazval',
  'světlo',
  'dnem',
  'a',
  'tmu',
  'nazval',
  'nocí',
  'a',
  'byl',
  'večer',
  'a',
  'bylo',
  'ráno',
  'jeden',
  'den'],
 ['i',
  'řekl',
  'bůh',
  'budiž',
  'klenba',
  'uprostřed',
  'vod',
  'a',
  'nechť',
  'odděluje',
  'vody',
  'od',
  'vod'],
 ['bůh',
  'tedy',
  'udělal',
  'klenbu',
  'a',
  'oddělil',
  'vody',
  'které',
  'byly',
  'pod',
  'klenbou',
  'od',
  'vod',
  'které',
  'byly',
  'nad',
  'klenbou'],
 ['a', 'stalo', 'se', 'tak'],
 ['bůh',
  'nazval',
  'klenbu',
  'nebesy',
  'a',
  'byl',
  'večer',
  'a',
  'bylo',
  'r