In [1]:
from openai import OpenAI
import pandas as pd
import time
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
import ast

client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [2]:
# Load CSV into a DataFrame
df = pd.read_csv('book_of_mormon_with_embeddings.csv')

# Display the first few rows of the DataFrame
print(df.head())

   Unnamed: 0     Book  Chapter  Verse Number  \
0           0  1 Nephi        1             1   
1           1  1 Nephi        1             2   
2           2  1 Nephi        1             3   
3           3  1 Nephi        1             4   
4           4  1 Nephi        1             5   

                                          Verse Text  \
0  I, Nephi, having been born of goodly parents, ...   
1  Yea, I make a record in the language of my fat...   
2  And I know that the record which I make is tru...   
3  For it came to pass in the commencement of the...   
4  Wherefore it came to pass that my father, Lehi...   

                                     small_embedding  
0  [0.022863460704684258, -0.0034764057490974665,...  
1  [0.014547476544976234, 0.026577120646834373, 0...  
2  [0.02322634868323803, 0.01963120698928833, 0.0...  
3  [0.0457134023308754, 0.023395659402012825, 0.0...  
4  [0.027752844616770744, -0.02123965509235859, -...  


## Convert the embeddings to a large numpy array

In [3]:
df['small_embedding'] = df['small_embedding'].apply(ast.literal_eval)
df['small_embedding'] = df['small_embedding'].apply(np.array)
small_embeddings = np.vstack(df["small_embedding"])
normed_small_embeddings = small_embeddings / np.linalg.norm(small_embeddings, axis=-1, keepdims=True)
normed_small_embeddings.shape

(6549, 1536)

In [8]:
np.save("normed_small_embeddings_bom.npy", normed_small_embeddings)

In [9]:
np.load("normed_small_embeddings_bom.npy")

array([[ 0.02286346, -0.00347641,  0.03435071, ..., -0.03299346,
        -0.02166661, -0.02224653],
       [ 0.01454748,  0.02657712,  0.01834966, ..., -0.0230547 ,
        -0.00166107, -0.03130759],
       [ 0.02322635,  0.01963121,  0.01236062, ..., -0.002928  ,
        -0.00654785, -0.03301107],
       ...,
       [ 0.01357608,  0.03861549, -0.0345573 , ...,  0.0188057 ,
        -0.02581338,  0.00472234],
       [ 0.03801824,  0.05088155, -0.02938144, ...,  0.01178116,
        -0.05725195, -0.00396874],
       [ 0.03645878, -0.02000522,  0.03394605, ...,  0.0200173 ,
         0.02807495, -0.01213481]])

In [4]:
normed_small_embeddings = small_embeddings / np.linalg.norm(small_embeddings, axis=-1, keepdims=True)

## How to use the function

In [5]:
def find_most_similar_verses(input_text, topk=10, comparison_embeddings=None, df=None):
    new_embedding = get_embedding(input_text)
    similarities = normed_small_embeddings@(np.array(new_embedding) / np.linalg.norm(new_embedding))
    sorted_similarities = sorted(similarities, reverse=True)
    sorted_indices = np.argsort(similarities)[::-1][:topk]

    for idx in sorted_indices:
        result = df.iloc[idx]
        print(f"Book: {result[1]}, Chapter: {result[2]}, Verse: {result[3]}")
        print(f"{result[4]}")
        print(f"Similarity Score: {similarities[idx]}")
        print("\n")
    

In [16]:
input_text = "Jesus Christ"
find_most_similar_verses(input_text, topk=20, comparison_embeddings=normed_small_embeddings, df=df)

Book: 3 Nephi, Chapter: 11, Verse: 10
Behold, I am Jesus Christ, whom the prophets testified shall come into the world.
Similarity Score: 0.4873034943014518


Book: 3 Nephi, Chapter: 9, Verse: 15
Behold, I am Jesus Christ the Son of God. I created the heavens and the earth, and all things that in them are. I was with the Father from the beginning. I am in the Father, and the Father in me; and in me hath the Father glorified his name.
Similarity Score: 0.4446989120458757


Book: Ether, Chapter: 3, Verse: 14
Behold, I am he who was prepared from the foundation of the world to redeem my people. Behold, I am Jesus Christ. I am the Father and the Son. In me shall all mankind have life, and that eternally, even they who shall believe on my name; and they shall become my sons and my daughters.
Similarity Score: 0.43545436099258766


Book: 2 Nephi, Chapter: 25, Verse: 29
And now behold, I say unto you that the right way is to believe in Christ, and deny him not; and Christ is the Holy One of I

  print(f"Book: {result[1]}, Chapter: {result[2]}, Verse: {result[3]}")
  print(f"{result[4]}")


In [None]:
# Example of how to use the function
# Assuming 'df' is your dataframe loaded with embeddings
input_text = "I make a record in the language of my fathers"
top_k_verses = find_most_similar_verses(df, input_text, top_k=3)

# Print the results
for result in top_k_verses:
    print(f"Book: {result[0]}, Chapter: {result[1]}, Verse: {result[2]}")
    print(f"Text: {result[3]}")
    print(f"Similarity Score: {result[4]}")
    print("\n")

In [11]:
# Function to compute cosine similarity
def cosine_similarity(embedding1, embedding2):
    return 1 - cosine(embedding1, embedding2)

# Function to find the top-k most similar verses
def find_most_similar_verses(df, input_text, top_k=5):
    # Get the embedding for the input text
    new_embedding = get_embedding(input_text)
    
    # Initialize a list to store the results
    similarities = []
    
    # Iterate through the dataframe and calculate cosine similarity
    for idx, row in df.iterrows():
        print(row['small
        verse_embedding = np.array(row['small_embedding'])
        similarity = cosine_similarity(new_embedding, verse_embedding)
        similarities.append((row['Book'], row['Chapter'], row['Verse Number'], row['Verse Text'], similarity))
    
    # Sort the results based on similarity
    sorted_similarities = sorted(similarities, key=lambda x: x[4], reverse=True)
    
    # Return the top-k most similar verses
    return sorted_similarities[:top_k]

In [None]:
for idx, row in df.iterrows():
    verse_embedding = np.array(row['small_embedding'])
    print(verse_embedding)

In [12]:
# Example of how to use the function
# Assuming 'df' is your dataframe loaded with embeddings
input_text = "I make a record in the language of my fathers"
top_k_verses = find_most_similar_verses(df, input_text, top_k=3)

# Print the results
for result in top_k_verses:
    print(f"Book: {result[0]}, Chapter: {result[1]}, Verse: {result[2]}")
    print(f"Text: {result[3]}")
    print(f"Similarity Score: {result[4]}")
    print("\n")

ValueError: Input vector should be 1-D.