# Vector search


In [1]:
import json

# Load the precomputed embeddings: a JSON object mapping movie title -> embedding vector.
# JSON is UTF-8 by specification, so pin the encoding instead of relying on the
# platform default (which is not UTF-8 on every OS).
with open('embeddings/movies_text-embedding-3-small-1536.json', encoding='utf-8') as f:
    movies = json.load(f)

MODEL_DIMENSIONS = 1536 # Must match the dimension of the embeddings

# Fail fast if the file and the constant disagree, rather than producing
# meaningless similarities further down the notebook.
assert all(len(vector) == MODEL_DIMENSIONS for vector in movies.values()), \
    f"Every stored embedding must have {MODEL_DIMENSIONS} dimensions"

## Set up the OpenAI client with GitHub Models

Run the cell below if you are using OpenAI with GitHub Models. If you're running this in GitHub Codespaces, the GITHUB_TOKEN environment variable will already be set for you. If you're running this locally, make sure to set the GITHUB_TOKEN environment variable with a Personal Access Token (PAT) from GitHub. 

In [2]:
import os
import openai

# OpenAI-compatible client pointed at the GitHub Models inference endpoint.
# The token is read from the GITHUB_TOKEN environment variable; a missing
# variable raises KeyError here rather than failing on the first request.
openai_client = openai.OpenAI(
    api_key=os.environ["GITHUB_TOKEN"],
    base_url="https://models.inference.ai.azure.com",
)

# Embedding model requested by get_embedding().
MODEL_NAME = "text-embedding-3-small"

## Set up the AzureOpenAI client

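Run the cell below if you are using Azure OpenAI with keyless auth, are signed in with the Azure CLI (`az login`), and have already set up a `.env` file with the required variables (`AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_EMBEDDING_DEPLOYMENT`).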

In [5]:
import os

from azure.identity import AzureCliCredential, get_bearer_token_provider
import dotenv
import openai

# Pull settings from a local .env file into the process environment.
dotenv.load_dotenv()

# Keyless auth: reuse the Azure CLI login to obtain bearer tokens for the
# Cognitive Services scope instead of passing an API key.
azure_credential = AzureCliCredential()
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")

# Index os.environ directly so a missing setting raises KeyError naming the
# variable in this cell, instead of silently becoming None and failing later
# on the first embedding request.
openai_client = openai.AzureOpenAI(
    api_version="2024-10-21",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_ad_token_provider=token_provider,
)
# For Azure OpenAI the "model" passed to the API is the deployment name.
MODEL_NAME = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]

## Define function to compute embedding

In [3]:
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level
    ``openai_client``, requesting model ``MODEL_NAME`` with
    ``MODEL_DIMENSIONS`` dimensions, and returns the ``embedding`` of the
    first item in the response.
    """
    request = {
        "model": MODEL_NAME,
        "dimensions": MODEL_DIMENSIONS,
        "input": text,
    }
    response = openai_client.embeddings.create(**request)
    first_item = response.data[0]
    return first_item.embedding

## Search vector embeddings

In [4]:
import pandas as pd

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        v1: First vector (a sized, re-iterable sequence of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        If either vector has zero magnitude the similarity is undefined and
        0.0 is returned instead of raising ``ZeroDivisionError``.

    Raises:
        ValueError: If the vectors differ in length. ``zip`` would otherwise
            silently drop the extra components and return a misleading score.
    """
    if len(v1) != len(v2):
        raise ValueError(
            f"Vectors must have the same length, got {len(v1)} and {len(v2)}"
        )
    # Generator expressions avoid materialising a throwaway list per sum.
    dot_product = sum(a * b for a, b in zip(v1, v2))
    magnitude = (sum(a**2 for a in v1) * sum(b**2 for b in v2)) ** 0.5
    if magnitude == 0:
        return 0.0
    return dot_product / magnitude

def exhaustive_search(query_vector, vectors):
    """Rank every stored vector by cosine similarity to ``query_vector``.

    Args:
        query_vector: The vector to compare against.
        vectors: Mapping of title -> vector.

    Returns:
        A list of ``(title, similarity)`` tuples, ordered from most to least
        similar. Ties keep the mapping's iteration order.
    """
    scored = [
        (title, cosine_similarity(query_vector, candidate))
        for title, candidate in vectors.items()
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Embed the query text, then score it against every movie embedding.
new_vector = get_embedding("a toddler-friendly movie about cats")
similarities = exhaustive_search(new_vector, movies)

# Keep the ten best matches and round the scores to three decimals for display.
most_similar = similarities[:10]
similar_movies = [
    (title, round(score, 3))
    for title, score in most_similar
]

pd.DataFrame(similar_movies, columns=['movie', 'similarity'])

Unnamed: 0,movie,similarity
0,The Aristocats,0.518
1,The Tigger Movie,0.5
2,Ratatouille,0.488
3,Cars 2,0.465
4,Pooh's Heffalump Movie,0.459
5,The Fox and the Hound,0.458
6,African Cats,0.456
7,101 Dalmatians,0.456
8,Teacher's Pet: The Movie,0.454
9,A Goofy Movie,0.454


## ANN search: HNSW


In [5]:
import hnswlib

# Declaring index - take the dimension from MODEL_DIMENSIONS so the index
# cannot drift from the embeddings loaded at the top of the notebook
p = hnswlib.Index(space='cosine', dim=MODEL_DIMENSIONS)

# Initializing index - the maximum number of elements should be known beforehand
p.init_index(max_elements=len(movies), ef_construction=200, M=16)

# Element insertion (can be called several times):
# each id is the vector's position in the movies dict, so a label returned by
# a query can be mapped back to its title by index
vectors = list(movies.values())
ids = list(range(len(vectors)))
p.add_items(vectors, ids)

# Controlling the recall by setting ef:
p.set_ef(50) # ef must be at least k, the number of neighbors requested per query

### Index parameters are exposed as class properties:
print(f"Parameters passed to constructor:  space={p.space}, dim={p.dim}")
print(f"Index construction: M={p.M}, ef_construction={p.ef_construction}")
print(f"Index size is {p.element_count} and index capacity is {p.max_elements}")
print(f"Search speed/quality trade-off parameter: ef={p.ef}")

Parameters passed to constructor:  space=cosine, dim=1536
Index construction: M=16, ef_construction=200
Index size is 573 and index capacity is 573
Search speed/quality trade-off parameter: ef=50


In [6]:
# Search the HNSW index
new_vector = get_embedding("a toddler-friendly movie about cats")

# knn_query returns 2-D arrays (one row per query vector); row 0 is our single query
labels, distances = p.knn_query(new_vector, k=10)

# Labels are the integer ids assigned at insertion, i.e. positions in the
# movies dict. Build the title lookup once instead of once per result row.
movie_titles = list(movies)

# The index uses cosine space, so similarity = 1 - distance
similar_movies = [(movie_titles[label], round(1 - distance, 3)) for label, distance in zip(labels[0], distances[0])]
pd.DataFrame(similar_movies, columns=['movie', 'similarity'])

Unnamed: 0,movie,similarity
0,The Aristocats,0.518
1,The Tigger Movie,0.5
2,Ratatouille,0.488
3,Cars 2,0.465
4,Pooh's Heffalump Movie,0.459
5,The Fox and the Hound,0.458
6,African Cats,0.456
7,101 Dalmatians,0.456
8,Teacher's Pet: The Movie,0.454
9,A Goofy Movie,0.454
