In [1]:
import numpy as np
import pandas as pd
import random
import operator

from openai import OpenAI

from pathlib import Path

from chromadb.utils import embedding_functions
from chromadb import PersistentClient

from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [2]:
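# Set your OpenAI API key here if you intend to use the OpenAI models
# (prefer exporting OPENAI_API_KEY in your shell over committing a key to the notebook)
# import os
# os.environ["OPENAI_API_KEY"] = "<your OpenAI API key>"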

In [3]:
# Project root: the parent of the current working directory.
# If paths fail to resolve, point home_dir at the repository directory by hand.
home_dir = Path.cwd().parent
# All input files and the on-disk vector store live under <project root>/data
data_dir = Path(home_dir, 'data')

In [4]:
# Preview helper: the first three, three random middle, and the last three items of a list or DataFrame
def sample_df(x):
    """Return a nine-item preview of ``x``: head, random middle, tail.

    Inputs with fewer than nine items are returned unchanged. Otherwise the
    first three and last three positions are kept, together with three
    positions drawn at random (without replacement) from the interior and
    sorted ascending.

    A ``list`` yields a tuple of the selected items; any other input is
    indexed positionally through its ``.iloc`` accessor.
    """
    size = len(x)
    if size < 9:
        return x

    # Interior positions only, so the random picks never repeat the head or tail
    middle = random.sample(range(3, size - 3), 3)
    middle.sort()
    positions = [0, 1, 2, *middle, -3, -2, -1]

    if not isinstance(x, list):
        return x.iloc[positions]
    return tuple(x[pos] for pos in positions)

In [5]:
# Read the tab-separated verses: one row per verse, a reference column and a text column
kjv_path = data_dir / 'kjv.tsv'
kjv = (
    pd.read_csv(kjv_path, sep="\t", names=['ref', 'text'])
    # Prefix each verse with its own reference so the citation is part of the stored text
    .assign(text=lambda verses: verses['ref'] + ' ' + verses['text'])
    # Use the reference ("Book chapter:verse") as the row ID, with an unnamed index
    .set_index('ref', drop=False)
    .rename_axis(None)
)

# Break the reference into book / chapter / verse for metadata
book_and_cv = kjv['ref'].str.split(" ", n=1, expand=True)
chapter_and_verse = book_and_cv[1].str.split(":", n=1, expand=True)
kjv = kjv.assign(
    book=book_and_cv[0],
    chapter=chapter_and_verse[0],
    verse=chapter_and_verse[1],
).drop(columns='ref')

# One metadata dict per verse, in row order
kjv_metadata = kjv.loc[:, ['book', 'chapter', 'verse']].to_dict(orient='records')

In [6]:
# Give each verse some surrounding context: glue the previous and the next verse of the
# same chapter around it (the first/last verse of a chapter has no neighbour on that side).
# NOTE: this overwrites kjv['text'] in place, so running the cell twice widens the window again.
verses_in_chapter = kjv.groupby(['book', 'chapter'])['text']
preceding = verses_in_chapter.shift(1, fill_value='')
following = verses_in_chapter.shift(-1, fill_value='')
kjv['text'] = preceding + ' ' + kjv['text'] + ' ' + following

In [7]:
# Sanity check: preview the first, last, and a few random rows of the verse DataFrame
sample_df(kjv)

Unnamed: 0,text,book,chapter,verse
Genesis 1:1,Genesis 1:1 In the beginning God created the ...,Genesis,1,1
Genesis 1:2,Genesis 1:1 In the beginning God created the h...,Genesis,1,2
Genesis 1:3,"Genesis 1:2 And the earth was without form, an...",Genesis,1,3
1_Kings 7:27,1_Kings 7:26 And it [was] an hand breadth thic...,1_Kings,7,27
Isaiah 45:9,"Isaiah 45:8 Drop down, ye heavens, from above,...",Isaiah,45,9
Matthew 19:11,"Matthew 19:10 His disciples say unto him, If t...",Matthew,19,11
Revelation 22:19,Revelation 22:18 For I testify unto every man ...,Revelation,22,19
Revelation 22:20,Revelation 22:19 And if any man shall take awa...,Revelation,22,20
Revelation 22:21,Revelation 22:20 He which testifieth these thi...,Revelation,22,21


In [8]:
# Sanity check the metadata records too (a list of dicts, so the preview comes back as a tuple)
sample_df(kjv_metadata)

({'book': 'Genesis', 'chapter': '1', 'verse': '1'},
 {'book': 'Genesis', 'chapter': '1', 'verse': '2'},
 {'book': 'Genesis', 'chapter': '1', 'verse': '3'},
 {'book': 'Leviticus', 'chapter': '6', 'verse': '9'},
 {'book': 'Joshua', 'chapter': '10', 'verse': '8'},
 {'book': 'Psalm', 'chapter': '38', 'verse': '3'},
 {'book': 'Revelation', 'chapter': '22', 'verse': '19'},
 {'book': 'Revelation', 'chapter': '22', 'verse': '20'},
 {'book': 'Revelation', 'chapter': '22', 'verse': '21'})

In [9]:
# Set chroma path
chroma_path = data_dir / 'chroma'

# Name of the Chroma collection that holds the verse embeddings
COLLECTION_NAME = "nrsvcv_Bible_v1"
# Sentence-transformer model used for both indexing and querying
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# True rebuilds the collection from scratch on every run; set to False to keep and reuse an existing one
RESET_COLLECTION = True

# An embedding function to use with native chroma when creating the database and inserting records
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)

# The same model through LangChain's wrapper, for use with the LangChain vector store
hf_sentence_transformer = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Instantiate chromadb instance, data is stored on disk.
chroma_client = PersistentClient(path=str(chroma_path))

# Drop the old collection only when asked to AND only when it actually exists:
# deleting a collection that is not there raises, which used to break a first run on a fresh database.
if RESET_COLLECTION:
    # list_collections() yields Collection objects or plain names depending on the chromadb version
    existing_names = {getattr(entry, "name", entry) for entry in chroma_client.list_collections()}
    if COLLECTION_NAME in existing_names:
        chroma_client.delete_collection(name=COLLECTION_NAME)

# Create the collection (aka vector database) or, if it already exists, use it.
# Specify the model that we want to use to do the embedding.
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=sentence_transformer_ef)

In [11]:
# Split up dataframe or array into chunks to load into chroma (max records 5461 at a time)
def split_df_array(x, n: int) -> list:
    '''
    Split a dataframe, list, or array into n consecutive chunks.

    This replaces np.array_split, which will be deprecated for dataframes in
    future versions, and follows the same sizing rule: the first
    ``len(x) % n`` chunks hold one item more than the rest, so chunk sizes
    differ by at most one and no chunk is empty unless ``n > len(x)``.

    Parameters
    ----------
    x : anything with ``len()`` and positional slicing; objects exposing
        ``.iloc`` (DataFrame, Series) are sliced through it.
    n : number of chunks; non-integer values are rounded up.

    Returns
    -------
    list of exactly ``n`` chunks which, concatenated in order, reproduce ``x``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    '''
    n = int(np.ceil(n))
    if n < 1:
        raise ValueError("n must be at least 1")

    base_size, extra = divmod(len(x), n)
    # Slice by position, never by label
    rows = x.iloc if hasattr(x, 'iloc') else x

    chunks = []
    start = 0
    for i in range(n):
        # The first `extra` chunks absorb the remainder, one item each
        stop = start + base_size + (1 if i < extra else 0)
        chunks.append(rows[start:stop])
        start = stop
    return chunks

# Batch size kept just below the 5461-record insert limit noted above
MAX_RECORDS_PER_BATCH = 5400
# Ceiling division: enough batches that none exceeds the limit
n_chunks = -(-len(kjv) // MAX_RECORDS_PER_BATCH)
# Split the verses and their metadata the same way so batch i of each lines up
chunked_kjv, chunked_kjv_metadata = [
    split_df_array(records, n_chunks) for records in (kjv, kjv_metadata)
]

In [13]:
# Load every batch into the vector database.
# ChromaDB automatically converts and stores the text as vector embeddings, which may take a
# moment (10-15 mins on my very low spec machine with a passable SSD).
batches = zip(chunked_kjv, chunked_kjv_metadata)
for batch_no, (verses, verse_meta) in enumerate(batches, start=1):
    print(f"Adding chunk: {batch_no} of {n_chunks}.")
    # upsert (rather than add) so re-running the cell overwrites records with the same ID
    collection.upsert(
        ids=verses.index.to_list(),
        documents=verses['text'].to_list(),
        metadatas=verse_meta,
    )

Adding chunk: 1 of 6.
Adding chunk: 2 of 6.
Adding chunk: 3 of 6.
Adding chunk: 4 of 6.
Adding chunk: 5 of 6.
Adding chunk: 6 of 6.


In [15]:
# Smoke-test the vector database.
# This is a plain similarity search against the collection; no LLM is involved.
probe_texts = ["in the garden of Eden"]
wanted_fields = ['documents', 'distances', 'metadatas']
results = collection.query(query_texts=probe_texts, n_results=3, include=wanted_fields)

results

{'ids': [['Genesis 3:24', 'Genesis 3:23', 'Genesis 3:8']],
 'distances': [[0.8362309336662292, 0.8863940834999084, 0.9025758504867554]],
 'metadatas': [[{'book': 'Genesis', 'chapter': '3', 'verse': '24'},
   {'book': 'Genesis', 'chapter': '3', 'verse': '23'},
   {'book': 'Genesis', 'chapter': '3', 'verse': '8'}]],
 'embeddings': None,
 'documents': [['Genesis 3:23 Therefore the LORD God sent him forth from the garden of Eden, to till the ground from whence he was taken. Genesis 3:24 So he drove out the man; and he placed at the east of the garden of Eden Cherubims, and a flaming sword which turned every way, to keep the way of the tree of life. ',
   'Genesis 3:22 And the LORD God said, Behold, the man is become as one of us, to know good and evil: and now, lest he put forth his hand, and take also of the tree of life, and eat, and live for ever: Genesis 3:23 Therefore the LORD God sent him forth from the garden of Eden, to till the ground from whence he was taken. Genesis 3:24 So he d

In [16]:
# Wrap the existing chroma client in LangChain's Chroma vector store so it can back a chain.
# The collection name must match the collection created above, and the embedding model must be
# the one the records were indexed with (both use "all-MiniLM-L6-v2").
# NOTE(review): the collection is named "nrsvcv" but the text loaded is kjv.tsv - presumably a
# leftover name; confirm before renaming, since the name is also used when creating the collection.
chroma_db = Chroma(
    client=chroma_client,
    embedding_function=hf_sentence_transformer,
    collection_name="nrsvcv_Bible_v1")

In [17]:
# Question used for the similarity search and for both ChatGPT helpers below
query = "How old was jesus, citing the book, chapter, and verse"

In [18]:
# Test query through the LangChain wrapper; again this only uses vector similarity, no LLM.
# The result is a list of (Document, score) pairs.
chroma_db.similarity_search_with_score(query)

[(Document(page_content='Luke 3:23 And Jesus himself began to be about thirty years of age, being (as was supposed) the son of Joseph, which was [the son] of Heli, Luke 3:24 Which was [the son] of Matthat, which was [the son] of Levi, which was [the son] of Melchi, which was [the son] of Janna, which was [the son] of Joseph, Luke 3:25 Which was [the son] of Mattathias, which was [the son] of Amos, which was [the son] of Naum, which was [the son] of Esli, which was [the son] of Nagge,', metadata={'book': 'Luke', 'chapter': '3', 'verse': '24'}),
  0.8709121942520142),
 (Document(page_content='Luke 2:40 And the child grew, and waxed strong in spirit, filled with wisdom: and the grace of God was upon him. Luke 2:41 Now his parents went to Jerusalem every year at the feast of the passover. Luke 2:42 And when he was twelve years old, they went up to Jerusalem after the custom of the feast.', metadata={'book': 'Luke', 'chapter': '2', 'verse': '41'}),
  0.8860723972320557),
 (Document(page_con

In [None]:
# Configure the ChatGPT model that the retrieval chain uses as its LLM
# (requires OPENAI_API_KEY to be set in the environment)
gpt35_model = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0.8
)

In [None]:
def orig_chatgpt_call(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the OpenAI chat API as a single user message and return the reply text.

    No retrieval is involved; this is the baseline to compare the RAG call against.

    Parameters
    ----------
    prompt : the user message content.
    model : name of the OpenAI chat model to query.

    Returns
    -------
    The message content of the first choice in the response.
    """
    # A new client is built on every call; fine for a one-off test, not great practice :)
    conversation = [{"role": "user", "content": prompt}]
    completion = OpenAI().chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content


def rag_chatgpt_call(prompt, model=None):
    """Answer ``prompt`` with retrieval-augmented generation over the Chroma verse store.

    Builds a "stuff" RetrievalQA chain whose retriever is ``chroma_db`` and
    returns the chain's ``'result'`` text.

    Parameters
    ----------
    prompt : the question to answer.
    model : optional OpenAI chat model name. When omitted (the default) the
        notebook-level ``gpt35_model`` is used, exactly as before. When given,
        a ChatOpenAI instance for that model is created with the same
        temperature as ``gpt35_model``. Previously this argument was accepted
        but silently ignored.

    Returns
    -------
    The answer string produced by the chain.
    """
    if model is None:
        llm = gpt35_model
    else:
        llm = ChatOpenAI(model_name=model, temperature=gpt35_model.temperature)

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=chroma_db.as_retriever(),
    )

    return chain.invoke(prompt)['result']

In [None]:
# Baseline: ask the model directly, with no retrieved verses in the prompt
orig_chatgpt_call(query)

'The Bible does not provide a specific age for Jesus at the time of his death. However, according to historical accounts, scholars generally believe that Jesus was crucified around the age of 33.'

In [None]:
# Same question, answered through the retrieval chain backed by the Chroma verse store
rag_chatgpt_call(query)

'In Luke 3:23, it is mentioned that Jesus began to be about thirty years of age.'