### Set up

In [3]:
import json
import os

import azure.identity
import dotenv
import numpy as np
import openai
import pandas as pd

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Fail fast on missing configuration: os.getenv returns None for an unset
# variable, which would otherwise be formatted into the endpoint URL below as
# "https://None.openai.azure.com" and passed as model=None to the embeddings call.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_ADA_DEPLOYMENT", AZURE_OPENAI_ADA_DEPLOYMENT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Authenticate with an Azure AD token provider (no API key is passed to the client);
# the token is requested for the Cognitive Services scope.
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider)

def get_embedding(text):
    """Return the embedding vector for a single input.

    Sends ``text`` to the embeddings endpoint of the module-level
    ``openai_client`` using the ``AZURE_OPENAI_ADA_DEPLOYMENT`` deployment and
    returns the ``embedding`` attribute of the first item in the response.

    Args:
        text: The input to embed (a single string in this notebook's usage).

    Returns:
        The embedding of the first response item.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_item = response.data[0]
    return first_item.embedding
    
def get_embeddings(sentences):
    """Return one embedding vector per item in the response for ``sentences``.

    Sends the whole collection in a single request to the embeddings endpoint
    of the module-level ``openai_client`` (deployment
    ``AZURE_OPENAI_ADA_DEPLOYMENT``) and collects the ``embedding`` attribute
    of every item in ``response.data``, in the order the response lists them.

    Args:
        sentences: The inputs to embed (a list of strings in this notebook's usage).

    Returns:
        A list with the embedding of each response item.
    """
    response = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)
    return vectors


### Vector representations

In [4]:
# Embed a short sample string so the resulting vector can be inspected below
vector = get_embedding("Hello, world!")

In [5]:
# Display the raw embedding values
vector

[-0.0030003529973328114,
 0.016995597630739212,
 -0.01854763738811016,
 -0.03537420183420181,
 -0.009158567525446415,
 0.015766259282827377,
 -0.02676883526146412,
 -0.01817883551120758,
 -0.020652879029512405,
 -0.014744372107088566,
 0.031593985855579376,
 0.011686393991112709,
 -0.007087901700288057,
 -0.012731331400573254,
 -0.011586510576307774,
 -0.00703795999288559,
 0.017702467739582062,
 -0.0005743313231505454,
 0.006857400760054588,
 0.003722588997334242,
 -0.015044023282825947,
 -0.005797096993774176,
 0.01565100997686386,
 0.005151694640517235,
 -0.006745992228388786,
 -0.005558912642300129,
 0.008520849049091339,
 -0.02238931879401207,
 0.02298862114548683,
 -0.030579783022403717,
 0.00875134952366352,
 -0.03120981715619564,
 -0.0030541366431862116,
 -0.012523881159722805,
 0.007157051935791969,
 -0.027460336685180664,
 -0.010226555168628693,
 -0.016473129391670227,
 0.03084101714193821,
 -0.012062879279255867,
 0.010042154230177402,
 -9.454137034481391e-05,
 -0.0159506611

In [6]:
# Number of values in the embedding
len(vector)

1536

### Document similarity modeled as cosine similarity

In [7]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Args:
        a: First vector (any sequence or array accepted by ``np.dot``).
        b: Second vector, with the same length as ``a``.

    Returns:
        The cosine similarity of ``a`` and ``b``. If either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case instead
        of dividing by zero and producing ``nan`` with a RuntimeWarning.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Guard the division: a zero-length direction has no angle to compare.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)

# The same sentence three times, so each row compares it against a different candidate
sentences1 = ['The new movie is awesome'] * 3

# Candidates: a different topic, a rewording, and the identical sentence
sentences2 = [
    'The dog plays in the garden',
    'This recent movie is so good',
    'The new movie is awesome',
]

# One request per list; each returns one embedding per sentence
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each sentence pair alongside the cosine similarity of its embeddings
for i, (left, right) in enumerate(zip(sentences1, sentences2)):
    score = cosine_similarity(embeddings1[i], embeddings2[i])
    print(f"{left} \t\t {right} \t\t Score: {score:.4f}")

The new movie is awesome 		 The dog plays in the garden 		 Score: 0.6682
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9035
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000


### Vector search

In [8]:
# Load in vectors for movie titles.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, so non-ASCII text in the JSON file could fail
# to decode (or decode differently) on systems whose default is not UTF-8.
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [9]:
# Embed the search query
query = "My Neighbor Totoro"

embeddings_response = openai_client.embeddings.create(
    input=[query],
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)
vector = embeddings_response.data[0].embedding

# Score every entry in movie_vectors against the query vector
scores = [
    (movie, cosine_similarity(vector, movie_vectors[movie]))
    for movie in movie_vectors
]

# Rank by score, highest first, and show the ten best matches
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
221,Toy Story,0.788384
314,Toy Story 2,0.77518
524,Frankenweenie,0.772496
365,Sen to Chihiro no Kamikakushi,0.771209
411,Pooh's Heffalump Movie,0.770842
534,Kaze Tachinu,0.769281
28,The Fox and the Hound,0.768989
468,Beverly Hills Chihuahua,0.768988
561,Zootopia,0.768835
330,Coyote Ugly,0.768814
