### Set up

In [3]:
import json
import os

import azure.identity
import dotenv
import numpy as np
import openai
import pandas as pd

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Fail fast on missing configuration: os.getenv returns None for an unset
# variable, which would otherwise be interpolated into the endpoint URL below
# as the literal text "None" and only surface later as a confusing
# connection error.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_ADA_DEPLOYMENT", AZURE_OPENAI_ADA_DEPLOYMENT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Authenticate with an Azure AD bearer token obtained through
# DefaultAzureCredential; no API key is passed to the client.
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(azure_credential,
    "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider)

def get_embedding(text):
    """Request an embedding for ``text`` and return it.

    Sends ``text`` to the deployment named by ``AZURE_OPENAI_ADA_DEPLOYMENT``
    through the module-level ``openai_client`` and returns the ``embedding``
    attribute of the first item in the response.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_item = response.data[0]
    return first_item.embedding
    
def get_embeddings(sentences):
    """Request embeddings for several inputs in a single API call.

    ``sentences`` is passed as-is as the ``input`` of one embeddings request
    against the ``AZURE_OPENAI_ADA_DEPLOYMENT`` deployment. Returns a list
    holding the ``embedding`` of every item in the response, in the order
    the response lists them.
    """
    response = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors


### Vector representations

In [12]:
# Embed one sample sentence. Inputs of roughly 512 tokens embed best;
# the hard limit is 8192 tokens.
vector = get_embedding(
    "A dog just walked past my house and yipped yipped like a Martian"
)

In [13]:
# Show the raw embedding values returned for the sample sentence
vector

[-0.014229155145585537,
 -0.007121039554476738,
 -0.027062535285949707,
 0.01220657303929329,
 -0.003809303743764758,
 0.022009311243891716,
 0.009647651575505733,
 -0.018106309697031975,
 -0.0006470000953413546,
 -0.025111034512519836,
 0.016051417216658592,
 -0.0027721649967134,
 0.010571706108748913,
 -0.00550555856898427,
 0.011508685536682606,
 0.011637924239039421,
 0.02410297468304634,
 0.010119372047483921,
 0.01659421995282173,
 0.026829905807971954,
 -0.01093357428908348,
 0.02250041626393795,
 0.01924360729753971,
 -0.02339216321706772,
 -0.015379377640783787,
 -0.002851323690265417,
 0.02128557488322258,
 -0.015120900236070156,
 0.0008667054935358465,
 -0.009511951357126236,
 0.014022373594343662,
 -0.005957893095910549,
 -0.038719844073057175,
 0.005321393720805645,
 -0.021647443994879723,
 -0.02087201178073883,
 0.01649082824587822,
 -0.010817259550094604,
 0.009221164509654045,
 -0.02323707565665245,
 -0.004972449969500303,
 0.0074247498996555805,
 0.0056638759560883045,

In [11]:
# Number of values (dimensions) in the embedding
len(vector)

1536

### Document similarity measured by cosine similarity

In [4]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product
    of their Euclidean norms. No special handling is applied when either
    norm is zero.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product

# Compare one reference sentence against three candidates: a random
# character string, a paraphrase, and an exact copy.
sentences1 = ['The new movie is awesome'] * 3

sentences2 = [
    'djkshsjdkhfsjdfkhsd',
    'This recent movie is so good',
    'The new movie is awesome',
]

# One batched embeddings request per list
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each pair side by side with its similarity score
for left, right, left_vec, right_vec in zip(sentences1, sentences2, embeddings1, embeddings2):
    score = cosine_similarity(left_vec, right_vec)
    print(f"{left} \t\t {right} \t\t Score: {score:.4f}")

The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.7467
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9191
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000


### Vector search

In [16]:
# Load in vectors for movie titles
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding; without it, non-ASCII titles could be
# mis-decoded or raise on some systems.
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [18]:
# Compute vector for query
query = "My Neighbor Totoro"

# Use the shared get_embedding helper rather than repeating the raw
# embeddings call. NOTE: this rebinds `vector`, replacing the sample
# sentence embedding computed earlier in the notebook.
vector = get_embedding(query)

# Compute cosine similarity between query and each movie title.
# movie_vectors maps each title to its stored vector, so iterate over the
# (title, vector) pairs directly instead of re-indexing by key.
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Display the top 10 results, highest similarity first
df = (
    pd.DataFrame(scores, columns=['Movie', 'Score'])
    .sort_values('Score', ascending=False)
)
df.head(10)

Unnamed: 0,Movie,Score
221,Toy Story,0.868217
314,Toy Story 2,0.8503
28,The Fox and the Hound,0.84883
534,Kaze Tachinu,0.846195
495,Toy Story 3,0.845041
561,Zootopia,0.84281
365,Sen to Chihiro no Kamikakushi,0.841869
511,Winnie the Pooh,0.841173
1,Pinocchio,0.839548
481,Gake no ue no Ponyo,0.838221
