In [2]:
import csv
import json
import os

from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import openai
import dotenv


# Pull settings from a local .env file (if present) before reading them below
dotenv.load_dotenv()

# Set up OpenAI client
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast with a readable message: an unset variable would otherwise turn
# into the endpoint "https://None.openai.azure.com" (or model=None) and only
# surface later as an unrelated-looking network/API error.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", AZURE_OPENAI_EMBEDDING_DEPLOYMENT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Authenticate with Azure AD bearer tokens (no API key is passed to the client)
azure_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)

def get_embeddings(words):
    """Calculate embeddings using OpenAI in a batch (all words at once).

    Args:
        words: Iterable of strings to embed (list, tuple, generator, ...).
            A single bare string is treated as one word rather than being
            split into characters.

    Returns:
        dict mapping each word to the embedding returned for it. Duplicate
        words collapse into a single key. Empty input returns ``{}`` without
        contacting the service.
    """
    # Materialize once: the input is used both for the request and for pairing
    # results back to words, so a one-shot iterator must not be consumed twice.
    if isinstance(words, str):
        words = [words]
    else:
        words = list(words)

    # Nothing to embed: skip the request instead of sending an empty batch.
    if not words:
        return {}

    embeddings_response = openai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT, input=words
    )

    word_vectors = {}
    for embedding_object in embeddings_response.data:
        # Pair by the item's reported position in the input rather than by
        # the order in which items happen to appear in the response.
        word_vectors[words[embedding_object.index]] = embedding_object.embedding
    return word_vectors

In [3]:
# Open most-common-nouns-english.csv and read the first column as words.
# newline='' lets the csv module handle line endings itself, and the explicit
# encoding keeps the result independent of the platform default.
with open('most-common-nouns-english.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    # Blank lines come back as [] and empty cells as '' - neither is a word.
    words = [row[0] for row in reader if row and row[0]]

# Calculate embeddings for all words
word_vectors = get_embeddings(words)

# Save embeddings to a file
with open('vectors_openai_ada.json', 'w', encoding='utf-8') as f:
    json.dump(word_vectors, f)