## Libraries

In [None]:
from pathlib import Path
import pandas as pd
import json
import numpy as np


In [None]:
import chromadb
from chromadb.config import Settings


In [None]:
from langchain.vectorstores import Chroma


In [None]:
from chromadb import Client


## Filepath

In [None]:
# Resolve the data/interim folder relative to the directory the notebook
# is executed from: it lives under the working directory's grandparent.
cwd = Path.cwd()
parent_dir = cwd.parent
grandparent_dir = parent_dir.parent

# <grandparent>/data/interim
data_interim_dir = grandparent_dir.joinpath('data', 'interim')

## Functions

In [None]:
def get_verses_per_book(data: list, book_number: int) -> dict:
    """Map every verse reference of one book to its verse.

    Args:
        data: Books indexed by position. Each book is a mapping with a
            'name' entry and a 'chapters' entry; 'chapters' is a sequence
            of chapters and each chapter is a sequence of verses.
        book_number: Zero-based position of the book inside ``data``.

    Returns:
        A dict keyed by "<book name> <chapter>:<verse>", with chapters and
        verses numbered from 1, whose values are the verses. Keys are
        inserted in chapter order, then verse order.

    Raises:
        IndexError: If ``book_number`` is outside ``data``.
        KeyError: If the book lacks a 'name' or 'chapters' entry.
    """
    book = data[book_number]
    book_text = book['name']
    verses = {}
    # start=1 gives the 1-based numbering used in the references directly.
    for chapter_number, chapter in enumerate(book['chapters'], start=1):
        for verse_number, verse in enumerate(chapter, start=1):
            verses[f"{book_text} {chapter_number}:{verse_number}"] = verse
    return verses

In [None]:
def get_verses_of_all_books(data: list) -> list:
    """Build the reference-to-verse dict of every book in ``data``.

    Args:
        data: Books indexed by position, in the structure accepted by
            ``get_verses_per_book``.

    Returns:
        A list with one dict per book, in the same order as ``data``; each
        dict is the result of ``get_verses_per_book`` for that position.
    """
    # Only the position is needed; the per-book helper indexes into data.
    return [
        get_verses_per_book(data, book_number)
        for book_number, _ in enumerate(data)
    ]

## Execute

In [None]:
# Location of the source JSON inside data/interim.
filepath = data_interim_dir / 'en_bbe.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) rather than with
# the platform's locale default, which differs between operating systems.
with open(filepath, encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# One {reference: verse} dict per book, in the order the books appear in data.
bible_books = get_verses_of_all_books(data)

In [None]:
# After transposing, rows are verse references and columns are book
# positions; cells for references that are not in a given book are missing.
df_bible = pd.DataFrame.from_dict(bible_books).transpose()

In [None]:
# Collapse each row into a single 'verse' column by joining its non-null
# cells with ','. An already existing 'verse' column is left out of the
# join so that re-running this cell does not append the previous result
# to itself.
df_bible['verse'] = df_bible.drop(columns='verse', errors='ignore').apply(
    lambda row: ','.join(row.dropna().astype(str)), axis=1
)

In [None]:
# Independent frame holding only the combined 'verse' column.
df_bible_flat = df_bible[['verse']].copy()

### Test

In [None]:
# Spot check: show the value at the first row and first column of df_bible.
df_bible.iat[0,0]

In [None]:
# Spot check: show the first value of df_bible_flat (its only column is 'verse').
df_bible_flat.iat[0,0]

### ChromaDB Setup

In [None]:
# Directory (inside data/interim) handed to Chroma as its persist directory.
filepath = str(data_interim_dir.joinpath('ChromaDB'))

# Client configured with the "duckdb+parquet" implementation.
chroma_client = chromadb.Client(
    Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=filepath,
    )
)

In [None]:
# Show the collections currently known to the client.
chroma_client.list_collections()

### Create Embeddings

#### Single Book

In [None]:
book_number = 0
# Collection name: the book's name lower-cased with spaces removed.
book_name = data[book_number]['name'].lower().replace(" ", "")

# NOTE: passing embedding_function='SentenceTransformerEmbeddingFunction'
# (a plain string) to get_or_create_collection led to an error at query
# time, so the collection is created without an explicit embedding function.
collection = chroma_client.get_or_create_collection(name=book_name)

# Keep only this book's column, drop the rows that have no value in it,
# and expose the text under the column name 'verse'.
df_book = (
    df_bible[[book_number]]
    .dropna()
    .rename(columns={book_number: 'verse'})
)

# The add call below is wrapped in a string literal, so it is not executed.
# NOTE(review): presumably disabled after the collection was first filled;
# confirm before re-enabling.
"""
collection.add(
        documents = list(df_book['verse']),
        ids = list(df_book.index)
        )
"""

chroma_client.persist()

In [None]:
# NOTE(review): the value returned by peek() is not used here; in a notebook
# cell only the last expression (count()) is displayed.
collection.peek()
# Number of items stored in the collection.
collection.count()

In [None]:
# Search the current collection for the 10 nearest matches to the query text.
query = ["Creation"]
result = collection.query(query_texts=query, n_results=10)

In [None]:
# Tabulate the query result and discard the columns that are not shown.
result_df = pd.DataFrame.from_dict(result).drop(
    columns=['embeddings', 'metadatas']
)
# Explode all remaining columns in lockstep to get one row per hit.
result_df = result_df.explode(list(result_df.columns))
# Keep the frame ordered by ascending distance (previously the sorted copy
# was only displayed and then discarded), then display it.
result_df = result_df.sort_values("distances", ascending=True)
result_df

In [None]:
# Print "<reference> - <verse>" for every hit. The reference and the verse
# are read from the first and second column of result_df by position.
for reference, verse in zip(result_df.iloc[:, 0], result_df.iloc[:, 1]):
    print(f"{reference} - {verse}")

#### Bible

In [None]:
# Collection meant to hold the verses of df_bible_flat under the name "bible".
collection = chroma_client.get_or_create_collection(name="bible")

# The add/persist calls below sit inside a string literal, so they are not
# executed; as the last expression of the cell the string itself is the
# cell's value.
# NOTE(review): presumably disabled after the collection was first filled
# and persisted; confirm before re-enabling.
"""
collection.add(
        documents = list(df_bible_flat['verse']),
        ids = list(df_bible_flat.index)
        )

# Only have to persist once
chroma_client.persist()
"""


In [None]:
# collection.peek()  # disabled
# Number of items stored in the "bible" collection.
collection.count()

In [None]:
# Search the current collection for the 7 nearest matches to the query text.
query = ["Live"]
result = collection.query(query_texts=query, n_results=7)

In [None]:
# Tabulate the query result and discard the columns that are not shown.
result_df = pd.DataFrame.from_dict(result).drop(
    columns=['embeddings', 'metadatas']
)
# Explode all remaining columns in lockstep to get one row per hit.
result_df = result_df.explode(list(result_df.columns))
# Keep the frame ordered by ascending distance (previously the sorted copy
# was only displayed and then discarded), then display it.
result_df = result_df.sort_values("distances", ascending=True)
result_df

In [None]:
# Print "<reference> - <verse>" for every hit. The reference and the verse
# are read from the first and second column of result_df by position.
for reference, verse in zip(result_df.iloc[:, 0], result_df.iloc[:, 1]):
    print(f"{reference} - {verse}")