In [1]:
import openai
from openai import OpenAI
import pandas as pd
import time
from tqdm import tqdm
import numpy as np
# Shared API client used by the embedding calls in this notebook.
# NOTE(review): constructed with no arguments, so credentials are presumably
# read from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of a single string.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    The text is submitted as a one-element batch through the module-level
    ``client`` and the embedding of that single element is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [12]:
# Load both testament CSVs (old first, then new) into DataFrames
old_testament_df, new_testament_df = (
    pd.read_csv(csv_path)
    for csv_path in ('old_testament.csv', 'new_testament.csv')
)

# Batching / retry settings read by generate_embeddings
max_retries = 5
batch_size = 100

# Helper: one API call for one batch, with backoff on rate limits
def _embed_batch_with_retry(batch_verses, model, batch_start):
    """Request embeddings for one batch, retrying on rate-limit errors.

    Waits 2, 4, 8, ... seconds between attempts and gives up (re-raising the
    rate-limit error) once more than the module-level ``max_retries`` retries
    have been used. Any other exception is reported and re-raised at once.
    """
    retries = 0
    while True:
        try:
            return client.embeddings.create(input=batch_verses, model=model)
        except openai.RateLimitError:
            retries += 1
            if retries > max_retries:
                print(f"Exceeded maximum retries for batch starting at index {batch_start}.")
                raise
            wait_time = 2 ** retries  # exponential backoff
            print(f"Rate limit error. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise


# Function to generate embeddings for a given DataFrame
def generate_embeddings(df, testament_name, model="text-embedding-3-small"):
    """Embed every verse in ``df`` and return the vectors in row order.

    Verses are read from the 'Verse Text' column, newlines are replaced with
    spaces, and the texts are sent to the embeddings endpoint in chunks of the
    module-level ``batch_size``.

    Args:
        df: DataFrame with a 'Verse Text' column of strings.
        testament_name: Label used only in the progress message.
        model: Embedding model name passed to the API.

    Returns:
        A list with one embedding per verse, in the same order as ``df``.

    Raises:
        openai.RateLimitError: if a batch is still rate limited after
            ``max_retries`` retries.
        RuntimeError: if the API returns a different number of embeddings
            than verses sent for a batch.
        Exception: any other error raised by the API call is re-raised.
    """
    verses = df['Verse Text'].tolist()
    embeddings = []
    print(f"\nGenerating embeddings for the {testament_name}...")
    for start in tqdm(range(0, len(verses), batch_size)):
        batch_verses = [
            text.replace("\n", " ") for text in verses[start:start + batch_size]
        ]
        response = _embed_batch_with_retry(batch_verses, model, start)

        # A short or long response would silently misalign vectors and verses.
        if len(response.data) != len(batch_verses):
            raise RuntimeError(
                f"Expected {len(batch_verses)} embeddings for batch starting at "
                f"index {start}, got {len(response.data)}."
            )

        # Order by the reported input index rather than trusting list position.
        ordered_items = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered_items)
    return embeddings

In [14]:
# Embed every Old Testament verse
old_testament_embeddings = generate_embeddings(old_testament_df, "Old Testament")

# Stack into a NumPy array, scale each row to unit L2 norm, then write to disk
old_embeddings_array = np.asarray(old_testament_embeddings)
old_embeddings_array = old_embeddings_array / np.linalg.norm(
    old_embeddings_array, axis=-1, keepdims=True
)
np.save('normed_small_embeddings_ot.npy', old_embeddings_array)
print("\nOld Testament embeddings saved to 'normed_small_embeddings_ot.npy'.")


Old Testament embeddings saved to 'normed_small_embeddings_ot.npy'.


In [13]:
# Embed every New Testament verse
new_testament_embeddings = generate_embeddings(new_testament_df, "New Testament")

# Stack into a NumPy array, scale each row to unit L2 norm, then write to disk
new_embeddings_array = np.asarray(new_testament_embeddings)
new_embeddings_array = new_embeddings_array / np.linalg.norm(
    new_embeddings_array, axis=-1, keepdims=True
)
np.save('normed_small_embeddings_nt.npy', new_embeddings_array)
print("\nNew Testament embeddings saved to 'normed_small_embeddings_nt.npy'.")


Generating embeddings for the New Testament...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [01:11<00:00,  1.11it/s]



New Testament embeddings saved to 'normed_small_embeddings_nt.npy'.
