# Prep OpenAI embeddings

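This notebook uses OpenAI embedding models (through Azure OpenAI) to generate embeddings for a list of movie titles and for a set of common English nouns, and saves each set of embeddings to a JSON file.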

In [3]:
import csv
import logging
import json
import os

from azure.identity import AzureCliCredential, get_bearer_token_provider
import openai
import dotenv

# Show INFO-level (and higher) log records in the cell output, then load variables from a .env file.
logging.basicConfig(level=logging.INFO)
dotenv.load_dotenv()

# Azure OpenAI settings read from the environment; each one is None when its variable is unset.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
# Kept as a string here; get_embeddings() converts it to int when it is set.
AZURE_OPENAI_EMBEDDING_DIMENSIONS = os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS")

# Build the client with Azure CLI credentials: it obtains bearer tokens from the
# token provider rather than being given an API key.
azure_credential = AzureCliCredential()
token_provider = get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)
openai_client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token_provider=token_provider,
    api_version="2023-05-15",
)

def get_embeddings(words):
    """Embed a batch of words with a single Azure OpenAI request.

    All of ``words`` are sent in one ``embeddings.create`` call to the deployment
    named by ``AZURE_OPENAI_EMBEDDING_DEPLOYMENT``. When
    ``AZURE_OPENAI_EMBEDDING_DIMENSIONS`` is set, it is converted to ``int`` and
    passed as the ``dimensions`` argument; otherwise that argument is omitted.

    Args:
        words: The strings to embed, sent as the request's ``input``.

    Returns:
        A dict mapping each word to its embedding. Words are paired with the
        response items by position. A word that appears more than once ends up
        with a single key (the last occurrence wins). An empty ``words`` returns
        ``{}`` without calling the service.
    """
    # An empty batch has nothing to embed, so skip the request instead of
    # sending an empty input list to the service.
    if not words:
        return {}

    # Only pass `dimensions` when it was configured.
    dimensions_args = (
        {"dimensions": int(AZURE_OPENAI_EMBEDDING_DIMENSIONS)}
        if AZURE_OPENAI_EMBEDDING_DIMENSIONS
        else {}
    )

    embeddings_response = openai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        input=words,
        **dimensions_args,
    )
    # Match each response item to the input word at the same position.
    return {
        word: embedding_object.embedding
        for word, embedding_object in zip(words, embeddings_response.data)
    }

In [None]:
# Load the existing movie embeddings file (a dict of movie title -> embedding)
# and compute a new embedding for every title with the configured deployment.
from tqdm.notebook import tqdm

# Read the file and close it before starting the API calls, so the handle is not
# held open for the whole run. The encoding is explicit so reading does not
# depend on the platform default.
with open("embeddings/movies_text-embedding-ada-002.json", encoding="utf-8") as f:
    disney_vectors = json.load(f)

# Only the titles (the dict keys) are needed; the stored embeddings are not reused.
new_movie_vectors = {}
for movie in tqdm(disney_vectors, desc="Computing new embeddings"):
    # One request per title; get_embeddings returns {title: embedding}.
    new_movie_vectors[movie] = get_embeddings([movie])[movie]

# Write the new embeddings to a file named after the deployment and the dimensions setting.
filename = f"openai_movies_{AZURE_OPENAI_EMBEDDING_DEPLOYMENT}-{AZURE_OPENAI_EMBEDDING_DIMENSIONS}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(new_movie_vectors, f, indent=4)

Computing new embeddings:   0%|          | 0/573 [00:00<?, ?it/s]

INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://cog-rnrzqguit6q3g.openai.azure.com/openai/deplo

In [None]:
# Read the first column of most-common-nouns-english.csv as the list of words to embed.
# newline="" is what the csv module asks for on files passed to csv.reader.
with open("embeddings/most-common-nouns-english.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # Discard the first row (presumably a header -- confirm against the CSV).
    # The None default keeps an empty file from raising StopIteration.
    next(reader, None)
    # csv.reader yields [] for blank lines, so skip them rather than fail on row[0].
    words = [row[0] for row in reader if row]

# Calculate embeddings for all words in one batched request.
word_vectors = get_embeddings(words)

# Save the embeddings to a file named after the deployment and the dimensions setting.
filename = f"embeddings/words_{AZURE_OPENAI_EMBEDDING_DEPLOYMENT}-{AZURE_OPENAI_EMBEDDING_DIMENSIONS}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(word_vectors, f, indent=4)


INFO:azure.identity._internal.decorators:AzureCliCredential.get_token succeeded
DEBUG:azure.identity._internal.decorators:[Authenticated account] Client ID: 04b07795-8ddb-461a-bbee-02f9e1bf7b46. Tenant ID: e47e6fc9-3a2c-454a-8b8f-90cc6972fb77. User Principal Name: unavailableUpn. Object ID (user): ec08831b-4ffa-4864-9a18-520d7fda8a3f
DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/deployments/text-embedding-3-small/embeddings', 'headers': {'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6ImltaTBZMnowZFlLeEJ0dEFxS19UdDVoWUJUayIsImtpZCI6ImltaTBZMnowZFlLeEJ0dEFxS19UdDVoWUJUayJ9.eyJhdWQiOiJodHRwczovL2NvZ25pdGl2ZXNlcnZpY2VzLmF6dXJlLmNvbSIsImlzcyI6Imh0dHBzOi8vc3RzLndpbmRvd3MubmV0L2U0N2U2ZmM5LTNhMmMtNDU0YS04YjhmLTkwY2M2OTcyZmI3Ny8iLCJpYXQiOjE3NDEwNjg1ODEsIm5iZiI6MTc0MTA2ODU4MSwiZXhwIjoxNzQxMDczMTgyLCJhY3IiOiIxIiwiYWlvIjoiQVZRQnEvNFpBQUFBanNkMmozSjNCeTNzcnFMTGxqM0t6aWR2VHpkY2ZCZ3hEK3EvSlpMbFVnazNpSXhDbXI1Ujlya2k3MTc3UVhCRkZqblp2TTV5Z3ZNck5NYjBsemJPUGR