# Embeddings - Cosine Similarity

- Simplest example of generating an embedding with a REST call

In [35]:
import common
import requests

"""curl https://YOUR_RESOURCE_NAME.openai.azure.com/openai/deployments/YOUR_DEPLOYMENT_NAME/embeddings?api-version=2023-05-15 \
  -H "Content-Type: application/json" \
  -H "api-key: YOUR_API_KEY" \
  -d "{\"input\": \"The food was delicious and the waiter...\"}"""

# Create the OpenAI client using the key, API version and endpoint held in `common`.
client = common.get_openai_client(
    api_key=common.api_KEY,
    api_version=common.api_version,
    azure_endpoint=common.api_URI,
)

In [36]:
def post_request(url: str, prompt: str, timeout: float = 30.0):
    """POST ``prompt`` as the JSON ``input`` field to ``url`` and return the parsed reply.

    Args:
        url: Full URL of the endpoint to call.
        prompt: Text sent as the ``input`` field of the JSON request body.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {
        "Content-Type": "application/json",
        "api-key": common.api_KEY,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(
        url,
        headers=headers,
        json={"input": prompt},
        timeout=timeout,
    )
    # Fail here on HTTP errors rather than handing an error payload to callers,
    # who would otherwise trip over a missing key further downstream.
    response.raise_for_status()
    return response.json()

In [37]:
def get_embedding(prompt:str):
    """Request an embedding for ``prompt`` and return it paired with the prompt.

    The request goes to ``common.ada_full_URI`` via ``post_request``.

    Returns:
        A ``(prompt, embedding)`` tuple, where ``embedding`` is the
        ``embedding`` field of the first item in the response's ``data`` list.
    """
    payload = post_request(common.ada_full_URI, prompt)
    first_item = payload['data'][0]
    return prompt, first_item['embedding']

In [38]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / (|v1| * |v2|)``. With numpy the same value is
    ``np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))``.

    Args:
        v1: Sequence of numbers (iterated more than once, so not a generator).
        v2: Sequence of numbers, expected to have the same length as ``v1``.
            If the lengths differ, ``zip`` drops the surplus components from
            the dot product.

    Returns:
        The similarity as a float. If either vector is empty or has zero
        magnitude the angle is undefined and ``0.0`` is returned instead of
        raising ``ZeroDivisionError``.
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    magnitude_a = sum(a * a for a in v1) ** 0.5
    magnitude_b = sum(b * b for b in v2) ** 0.5
    denominator = magnitude_a * magnitude_b
    # Guard the division: an empty or all-zero vector has no direction.
    if denominator == 0:
        return 0.0
    return dot_product / denominator

### Prepare the mock vector database

In [39]:
# Mock vector store: one record per text, holding the text and (once filled in) its embedding.
# NOTE(review): "clorida" looks like a typo for "chloride"; left untouched because
# the text is sent to the embedding endpoint verbatim.
# NOTE(review): the last record has empty content -- confirm the endpoint accepts
# an empty input string.
vector_database = [
    {"content": text, "embedding": []}
    for text in (
        "The chemical composition of water is H2O.",
        "The speed of light is 300,000 km/s.",
        "Acceleration of gravity on earth is 9.8m/s^2.",
        "The chemical composition of salt or sodium clorida is NaCl.",
        "",
    )
]
# get_embedding returns (prompt, embedding); only the embedding is stored.
for prompt in vector_database:
    prompt.update(embedding=get_embedding(prompt["content"])[1])


### Embed the question

In [40]:
# Embed the question: p1 is the question text echoed back, e1 is its embedding.
p1, e1 = get_embedding("What is the speed of light?")

### Nearest search

In [41]:
limit = 3        # maximum number of results to report
relevance = 0.5  # minimum cosine similarity for an entry to count as a match

# Score every entry first. The cap must be applied after ranking; stopping the
# scan after `limit` rows would only ever consider the first rows of the database.
local_list = []
for entry in vector_database:
    cs = cosine_similarity(e1, entry["embedding"])
    if cs > relevance:
        local_list.append({"content": entry["content"], "similarity": cs})

# Most similar first, then keep only the top `limit` matches.
local_list.sort(key=lambda x: x["similarity"], reverse=True)
local_list = local_list[:limit]
for entry in local_list:
    print(entry)

{'content': 'The speed of light is 300,000 km/s.', 'similarity': 0.9076484494720922}
{'content': 'Acceleration of gravity on earth is 9.8m/s^2.', 'similarity': 0.8044927978430234}
{'content': 'The chemical composition of water is H2O.', 'similarity': 0.7380432345349796}
