### Set up

In [1]:
import json
import os

import azure.identity
import dotenv
import numpy as np
import openai
import pandas as pd

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# os.getenv returns None for unset variables, which would otherwise be
# interpolated into the endpoint as "https://None.openai.azure.com" and only
# fail later with an obscure connection error. Fail fast with a clear message.
if not AZURE_OPENAI_SERVICE or not AZURE_OPENAI_ADA_DEPLOYMENT:
    raise RuntimeError(
        "AZURE_OPENAI_SERVICE and AZURE_OPENAI_ADA_DEPLOYMENT must be set "
        "in the environment (or in a .env file) before running this notebook."
    )

# Authenticate with a bearer token obtained through DefaultAzureCredential
# rather than an API key.
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)
openai_client = openai.AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)

def get_embedding(text):
    """Request an embedding for a single piece of text.

    Args:
        text: Value passed as the ``input`` of the embeddings request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_result = response.data[0]
    return first_result.embedding
    
def get_embeddings(sentences):
    """Request embeddings for several texts with a single API call.

    Args:
        sentences: Value passed as the ``input`` of the embeddings request.

    Returns:
        A list with the ``embedding`` of every item in the response data, in
        the order the service returned them (assumed to match the input
        order -- TODO confirm).
    """
    response = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    return [item.embedding for item in response.data]


### Vector representations

In [2]:
# Optimal size to embed is ~512 tokens; the input limit is 8192 tokens.
vector = get_embedding(
    "im more of a cat person than a dog person but actually these days i just like humans"
)

In [3]:
vector # display the raw embedding values (8192 tokens is the input limit, not the length of this vector)

[-0.008702738210558891,
 0.00877292174845934,
 -0.010632780380547047,
 -0.005795976612716913,
 0.009410420432686806,
 0.002245867857709527,
 -0.052169639617204666,
 -0.010381290689110756,
 0.025008674710989,
 0.0013049721019342542,
 -0.012200209312140942,
 -0.027043992653489113,
 0.004316277336329222,
 -0.013966490514576435,
 -0.007521317806094885,
 -0.030763711780309677,
 -0.010030373930931091,
 0.004240245558321476,
 0.01203644834458828,
 0.00951569527387619,
 0.013112593442201614,
 -0.036495354026556015,
 -0.030950866639614105,
 0.011744017712771893,
 0.0029754824936389923,
 0.0030559010338038206,
 -0.008644252084195614,
 -0.013908004388213158,
 -0.02531280182301998,
 -0.011972113512456417,
 -0.01687910035252571,
 -0.00277516758069396,
 -0.016551578417420387,
 0.0025880117900669575,
 0.030202243477106094,
 -0.026810048148036003,
 0.054555874317884445,
 -0.020645609125494957,
 0.018926115706562996,
 0.04161873832345009,
 0.027886193245649338,
 0.016539881005883217,
 -0.04304580017924

In [4]:
len(vector)  # number of values (dimensions) in the embedding

3072

### Document similarity modeled as cosine similarity

In [5]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two Euclidean
    norms. Zero-norm vectors get no special handling: the division is
    performed as-is.

    Args:
        a: First vector (any sequence accepted by ``np.dot``).
        b: Second vector, same length as ``a``.

    Returns:
        The similarity score; 1 for identical directions, 0 for orthogonal
        vectors, -1 for opposite directions.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Compare one reference sentence against three others: random characters,
# a paraphrase, and an exact copy.
sentences1 = ['The new movie is awesome'] * 3

sentences2 = [
    'djkshsjdkhfsjdfkhsd',
    'This recent movie is so good',
    'The new movie is awesome',
]

# One embeddings request per list
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each pair alongside its similarity score
for i, first in enumerate(sentences1):
    score = cosine_similarity(embeddings1[i], embeddings2[i])
    print(f"{first} \t\t {sentences2[i]} \t\t Score: {score:.4f}")

The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.1257
The new movie is awesome 		 This recent movie is so good 		 Score: 0.6300
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000


### Vector search

In [7]:
# Load in vectors for movie titles.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's default text encoding, which can mis-decode or reject
# non-ASCII titles on some systems.
with open('openai_movies_v2.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [8]:
# Embed the query text
query = "My Neighbor Totoro"

embeddings_response = openai_client.embeddings.create(
    input=[query],
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)
vector = embeddings_response.data[0].embedding

# Score every movie title against the query using cosine similarity
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Rank by score, highest first, and show the ten best matches
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
365,Sen to Chihiro no Kamikakushi,0.591363
517,The Secret World of Arietty,0.573118
481,Gake no ue no Ponyo,0.563192
534,Kaze Tachinu,0.562784
498,Gedo Senki (Tales from Earthsea),0.512276
416,Hauru no ugoku shiro,0.497954
321,The Tigger Movie,0.475844
447,Bridge to Terabithia,0.464797
377,Piglet's Big Movie,0.452052
161,The Nightmare Before Christmas,0.448373
