## Libraries

In [1]:
from pathlib import Path
import pandas as pd
import json
import numpy as np


In [2]:
import chromadb
from chromadb.config import Settings


In [3]:
from chromadb import Client


## Filepath

In [4]:
# Get the current working directory
cwd = Path.cwd()

# Get the parent of the working directory
parent_dir = cwd.parent

# Get the parent of the parent directory
grandparent_dir = parent_dir.parent

# Get path to /data/interim
data_interim_dir = grandparent_dir / Path('data') / Path('interim')

## Functions

In [5]:
def get_verses_per_book(data: dict, book_number: int) -> dict:
    # Get the name of the book from the dictionary
    book_text = data[book_number]['name']
    reference_list = [] # keys of output dictionary
    verses_list = [] # values of output dictionary
    # Iterate over all chapters in the dictionary
    for chapter_number, chapter in enumerate(data[book_number]['chapters']):
        # Get the chapter number as a string
        chapter_number_txt = str(chapter_number+1)
        # Iterate over all verses in the chapter using enumerate() function
        for verse_number, verse in enumerate(chapter):
            # Get the verse number as a string
            verse_number_text = str(verse_number+1)
            # Append book name, chapter number and verse number into a string
            reference_list.append(f"{book_text} {chapter_number_txt}:{verse_number_text}")
            # Append verse to list
            verses_list.append(verse)   
    return  dict(zip(reference_list, verses_list))

In [6]:
def get_verses_of_all_books(data: dict) -> list:
    bible_books = []
    for book_number, books in enumerate(data):
        book_verses = get_verses_per_book(data,book_number)
        bible_books.append(book_verses)
    return bible_books

## Execute

In [53]:
filepath = data_interim_dir / Path('en_bbe.json')

with open(filepath) as f:
    data = json.load(f)

In [8]:
bible_books= get_verses_of_all_books(data)

In [9]:
df_bible = pd.DataFrame.from_dict(bible_books).T

In [10]:
df_bible['verse'] = df_bible.apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)

In [28]:
df_bible_flat = df_bible.drop([col for col in df_bible.columns if col != 'verse'], axis=1)

### Test

In [12]:
book_number = 0
data[book_number]
print("Book type")
print(type(data[book_number]))
book_chapters = data[book_number]['chapters']
print("Book chapter type")
print(type(book_chapters))
print("Book chapters in book")
print(len(book_chapters))

Book type
<class 'dict'>
Book chapter type
<class 'list'>
Book chapters in book
50


In [13]:
len(book_chapters[1])

25

In [14]:
df_bible.iat[0,0]

'At the first God made the heaven and the earth.'

In [15]:
df_bible_flat.iat[0,0]

'At the first God made the heaven and the earth.'

### ChromaDB Setup

In [134]:
filepath = str(data_interim_dir / Path('ChromaDB'))

chroma_client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                    persist_directory=filepath
                                ))

Using embedded DuckDB with persistence: data will be stored in: c:\Users\Admin\Documents\Github\scripture-semantic-search\data\interim\ChromaDB


In [17]:
chroma_client.list_collections()

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction
  from .autonotebook import tqdm as notebook_tqdm
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


[Collection(name=bible), Collection(name=3john), Collection(name=genesis)]

### Create Embeddings

#### Bible

In [None]:
collection = chroma_client.get_or_create_collection(name="bible")

"""
collection.add(
        documents = list(df_bible_flat['verse']),
        ids = list(df_bible_flat.index)
        )

# Only have to persist once
chroma_client.persist()
"""


In [None]:
# collection.peek()
collection.count()

In [None]:
query = ["Create"]

result = collection.query(
    query_texts=query,
    n_results=7)

In [None]:
result_df = pd.DataFrame.from_dict(result)
result_df = result_df.drop(['embeddings','metadatas'],axis=1)
result_df = result_df.explode(list(result_df.columns))
result_df.sort_values("distances", ascending=True)

In [None]:
for i in range(len(result_df)):
    reference = result_df.iat[i,0]
    verse = result_df.iat[i,1]
    print(f"{reference} - {verse}")

### Update Embeddings

If an id is not found in the collection, an exception will be raised. If documents are supplied without corresponding embeddings, the embeddings will be recomupted with the collection's embedding function.

#### Single Book - Single Chapter

In [None]:
book_number = 63
book_name = data[book_number]['name'].lower().replace(" ", "")
book_chapters = data[book_number]['chapters']

df_book = df_bible.drop([col for col in df_bible.columns if col != book_number], axis=1).dropna().rename(columns = {book_number:'verse'})

In [None]:
collection = chroma_client.get_or_create_collection(name=book_name)

result = collection.get(
    ids=list(df_book.index),
    include=["embeddings","documents","metadatas"]
)

result_df = pd.DataFrame.from_dict(result)

In [None]:
book_embeddings = list(result_df.embeddings)
book_documents = list(df_book['verse'])

meta_dict = {"book":book_name, "chapter":str(len(book_chapters))}
book_chapter_verses = len(book_chapters[0])

book_chapter_metadata = [meta_dict] * book_chapter_verses

In [None]:
print(len(book_embeddings))
print(len(book_documents))
print(len(book_chapter_metadata))

In [None]:
collection.update(
    ids=list(df_book.index),
    embeddings=book_embeddings,
    documents =book_documents,
    metadatas=book_chapter_metadata,
)

chroma_client.persist()

In [None]:
result = collection.get(
    ids=list(df_book.index),
    include=["embeddings","documents","metadatas"]
)

pd.DataFrame.from_dict(result)

#### Single Book - Multiple Chapters

In [None]:
book_number = 0
book_name = data[book_number]['name'].lower().replace(" ", "")
book_chapters = data[book_number]['chapters']

df_book = df_bible.drop([col for col in df_bible.columns if col != book_number], axis=1).dropna().rename(columns = {book_number:'verse'})

In [None]:
collection = chroma_client.get_or_create_collection(name=book_name)

result = collection.get(
    ids=list(df_book.index),
    include=["embeddings","documents","metadatas"]
)

result_df = pd.DataFrame.from_dict(result)
result_df.head()

In [None]:
book_embeddings = list(result_df.embeddings)
book_documents = list(df_book['verse'])
book_ids = list(df_book.index)

In [None]:
book_meta_dict_list = []
for i in range(len(book_chapters)):
    meta_dict = {"book":book_name, "chapter":str(i+1)}
    book_chapter_verses = len(book_chapters[i])
    book_meta_dict_list.append([meta_dict] * book_chapter_verses)

book_meta_dict_list = [item for sublist in book_meta_dict_list for item in sublist]


In [None]:
print(len(book_embeddings))
print(len(book_documents))
print(len(book_meta_dict_list))

In [None]:
collection.update(
    ids=book_ids,
    embeddings=book_embeddings,
    documents =book_documents,
    metadatas=book_meta_dict_list,
)

chroma_client.persist()

In [None]:
result = collection.get(
    ids=list(df_book.index),
    include=["embeddings","documents","metadatas"]
)

result_df = pd.DataFrame.from_dict(result)
result_df.head()

#### Bible

In [None]:
collection = chroma_client.get_or_create_collection(name='bible')

result = collection.get(
    ids=list(df_bible_flat.index),
    include=["embeddings","documents","metadatas"]
)

result_df = pd.DataFrame.from_dict(result)

In [None]:
bible_embeddings = list(result_df.embeddings)
bible_documents = list(df_bible_flat['verse'])
bible_ids = list(df_bible_flat.index)

In [None]:
bible_meta_dict_list = []

for x in range(len(data)):
    book_chapters = data[x]['chapters']
    book_name = data[x]['name'].lower().replace(" ", "")
    book_meta_dict_list = []
    for i in range(len(book_chapters)):
        meta_dict = {"book":book_name, "chapter":str(i+1)}
        book_chapter_verses = len(book_chapters[i])
        book_meta_dict_list.append([meta_dict] * book_chapter_verses)
    book_meta_dict_list = [item for sublist in book_meta_dict_list for item in sublist]
    bible_meta_dict_list.append(book_meta_dict_list)

bible_meta_dict_list = [item for sublist in bible_meta_dict_list for item in sublist]

In [None]:
print(len(bible_embeddings))
print(len(bible_documents))
print(len(bible_meta_dict_list))

In [None]:
collection.update(
    ids=bible_ids,
    embeddings=bible_embeddings,
    documents =bible_documents,
    metadatas=bible_meta_dict_list,
)

chroma_client.persist()

In [None]:
result = collection.get(
    ids=bible_ids,
    include=["embeddings","documents","metadatas"]
)

result_df = pd.DataFrame.from_dict(result)
result_df

### Recalculate Embeddings

#### Bible - Specific Book

In [11]:
book_number = 43
book_name = data[book_number]['name'].lower().replace(" ", "")
print(book_name)

acts


In [12]:
book_chapters = data[book_number]['chapters']

df_book = df_bible.drop([col for col in df_bible.columns if col != book_number], axis=1).dropna().rename(columns = {book_number:'verse'})

In [13]:
bible_book_ids = list(df_book.index)
bible_book_documents = list(df_book['verse'])

In [15]:
print(bible_book_documents)

['I have given an earlier account, O Theophilus, of all the things which Jesus did, and of his teaching from the first,', 'Till the day when he was taken up to heaven after he had given his orders, through the Holy Spirit, to the Apostles of whom he had made selection:', 'And to whom he gave clear and certain signs that he was living, after his death; for he was seen by them for forty days, and gave them teaching about the kingdom of God:', 'And when they were all together, with him, he gave them orders not to go away from Jerusalem, but to keep there, waiting till the word of the Father was put into effect, of which, he said, I have given you knowledge:', 'For the baptism of John was with water, but you will have baptism with the Holy Spirit, after a little time.', 'So, when they were together, they said to him, Lord, will you at this time give back the kingdom to Israel?', 'And he said to them, It is not for you to have knowledge of the time and the order of events which the Father h

In [129]:
bible_book_meta_dict_list = []
for i in range(len(book_chapters)):
    meta_dict = {"book":book_name, "chapter":str(i+1)}
    book_chapter_verses = len(book_chapters[i])
    bible_book_meta_dict_list.append([meta_dict] * book_chapter_verses)

bible_book_meta_dict_list = [item for sublist in bible_book_meta_dict_list for item in sublist]

In [130]:
bible_book_meta_dict_list

[{'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revelation', 'chapter': '1'},
 {'book': 'Revel

In [135]:
collection = chroma_client.get_or_create_collection(name='bible')

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction
  from .autonotebook import tqdm as notebook_tqdm


In [23]:
collection.update(
    ids=bible_book_ids,
    documents =bible_book_documents,
    metadatas=bible_book_meta_dict_list,
)

chroma_client.persist()

### Input Document

#### Find Empty Documents

In [99]:
def find_indices(list_to_check,item_to_find):
    return [idx for idx, value in enumerate(list_to_check) if value == item_to_find]

In [120]:
empty_documents_ref = []
value='[]'

for book_n in range(len(data)):
    book_n - 1
    book = data[book_n]['chapters']
    book_name =  data[book_n]['name']
    for chapt_n in range(len(book)):
        chapt_n - 1
        chapter = data[book_n]['chapters'][chapt_n]
        empty_chapt_documents = find_indices(chapter,value)
        if empty_chapt_documents != []:
            empty_documents_ref.append(f"{book_name} {chapt_n}:{empty_chapt_documents}")


In [121]:
print(empty_documents_ref)

['Matthew 16:[20]', 'Matthew 17:[10]', 'Matthew 22:[13]', 'Mark 6:[15]', 'Mark 8:[43, 45]', 'Mark 10:[25]', 'Mark 14:[27]', 'Luke 16:[35]', 'Luke 22:[16]', 'John 4:[3]', 'Acts 7:[36]', 'Acts 14:[33]', 'Acts 23:[6]', 'Acts 27:[28]', 'Romans 15:[23]']


In [123]:
empty_documents_ids = [ 'Matthew 16:20',
                        'Matthew 17:10',
                        'Matthew 22:13',
                        'Mark 6:15',
                        'Mark 10:25',
                        'Mark 14:27',
                        'Luke 22:16',
                        'John 4:3',
                        'Acts 7:36',
                        'Acts 23:6',
                        'Acts 27:28',
                        'Romans 15:23'
                        ]

In [124]:
empty_documents_verses = [  'Then he gave orders to the disciples to give no man word that he was the Christ.',
                            'And his disciples, questioning him, said, Why then do the scribes say that Elijah has to come first?',
                            'Then the king said to the servants, Put cords round his hands and feet and put him out into the dark; there will be weeping and cries of sorrow.',
                            'But others said, It is Elijah. And others said, It is a prophet, even like one of the prophets.',
                            'It is easier for a camel to go through the eye of a needle than for someone who is rich to enter the kingdom of God.',
                            'And Jesus said to them, You will all be turned away from me: for it is in the Writings, I will put the keeper of the sheep to death, and the sheep will be put to flight.',
                            ' For I tell you, I will not eat it again until it finds fulfillment in the kingdom of God.',
                            'He went out of Judaea into Galilee again',
                            'This man took them out, having done wonders and signs in Egypt and in the Red Sea and in the waste land, for forty years.',
                            'Then Paul, knowing that some of them were Sadducees and the others Pharisees, called out in the Sanhedrin, “My brothers, I am a Pharisee, descended from Pharisees. I stand on trial because of the hope of the resurrection of the dead.',
                            'And they let down the lead, and saw that the sea was a hundred and twenty feet deep; and after a little time they did it again and it was ninety feet.',
                            'But now, having no longer any place in these parts and having had for a number of years a great desire to come to you,'
                        ]

In [131]:
empty_documents_metadata =[ {'book': 'Matthew', 'chapter': '16'},
                            {'book': 'Matthew', 'chapter': '17'},
                            {'book': 'Matthew', 'chapter': '22'},
                            {'book': 'Mark', 'chapter': '6'},
                            {'book': 'Mark', 'chapter': '10'},
                            {'book': 'Mark', 'chapter': '14'},
                            {'book': 'Luke', 'chapter': '22'},
                            {'book': 'John', 'chapter': '4'},
                            {'book': 'Acts', 'chapter': '7'},
                            {'book': 'Acts', 'chapter': '23'},
                            {'book': 'Acts', 'chapter': '27'},
                            {'book': 'Romans', 'chapter': '15'},
                            ]

In [132]:
len(empty_documents_ids) == len(empty_documents_verses) == len(empty_documents_metadata)

True

In [127]:
delete_ids = ['Luke 16:35','Mark 8:43','Mark 8:45','Acts 14:33']

In [136]:
collection.delete(
    ids=delete_ids
)

[]

In [137]:
collection.update(
    ids=empty_documents_ids,
    documents =empty_documents_verses,
    metadatas=empty_documents_metadata,
)