# Setup

In [2]:
from openai import OpenAI
from openai import embeddings

In [3]:
from openai import __version__
print(__version__)

1.107.3


In [4]:
from dotenv import load_dotenv
import os
import json

load_dotenv(dotenv_path=".openaidev.env")



True

In [5]:
print(os.getenv("OPENAI_API_KEY")[:20], "...")
print(os.getenv("OPENAI_ORG")[:8], "...")
print(os.getenv("OPENAI_PROJECT")[:8], "...")

sk-proj-eUanLfMUnbXn ...
org-8d28 ...
proj_30j ...


## Envinronment Variables

Create a .env File: In the root of your project directory, create a file named `.env`. This file will hold your environment variables in the form of key-value pairs. For example:

```ini
    OPENAI_API_KEY = <openai_apikey>


    # optional 
    OPENAI_ORG = <openai_org_id>
    OPENAI_PROJECT = <openai_project_id>

```

In [14]:

# Set the API key and endpoint
api_key = os.getenv('OPENAI_API_KEY')
org = os.getenv('OPENAI_ORG')
project = os.getenv('OPENAI_PROJECT')


## OpenAI Client
Azure OpenAI provides two methods for authentication. You can use either API Keys or Microsoft Entra ID.

* API Key authentication: For this type of authentication, all API requests must include the API Key in the api-key HTTP header. 

* Microsoft Entra ID authentication: You can authenticate an API call using a Microsoft Entra token. Authentication tokens are included in a request as the Authorization header. The token provided must be preceded by Bearer, for example Bearer YOUR_AUTH_TOKEN. You can read our how-to guide on authenticating with Microsoft Entra ID.

We will be using API Key for this class. Store the API Key in the `.env` file, never share your API Key with others. 


In [15]:
client = OpenAI(
    api_key=api_key,
    organization=org,
    project=project
)

In [16]:
[m.id for m in client.models.list().data if m.id.__contains__("embedding")]

['text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002']

# Embeddings

https://openai.com/index/introducing-text-and-code-embeddings/

In [28]:
embedding_model = "text-embedding-3-small"

In [29]:
embedding = client.embeddings.create(
    model=embedding_model,
    input=["Hello, world!", "Hola!"]
)

In [30]:
[type(e) for e in embedding]

[tuple, tuple, tuple, tuple]

In [34]:
for e in embedding.data:
    print(e.embedding)

[-0.019143931567668915, -0.025292053818702698, -0.0017211713129654527, 0.01883450709283352, -0.03382139280438423, -0.019682060927152634, -0.02102738246321678, 0.05160655081272125, -0.03218010067939758, -0.03043118305504322, -0.0021508336067199707, -0.028924422338604927, -0.0024871639907360077, -0.03148053586483002, 0.010291713289916515, 0.01856544241309166, -0.04614454507827759, 0.04140901193022728, 0.00043050304520875216, 0.04116685315966606, 0.053651440888643265, 0.0018481360748410225, 0.004564004950225353, 0.009955382905900478, 0.04781274124979973, 0.002164286794140935, -0.00984775647521019, 0.038422394543886185, 0.0009131372789852321, -0.05209086835384369, 0.051122233271598816, -0.032529886811971664, -0.01408552099019289, -0.012605667114257812, 0.013271600939333439, 0.01856544241309166, 0.0016320437425747514, -0.0008479732787236571, -0.012773832306265831, -0.029677802696824074, -0.004510191734880209, -0.015309764072299004, 0.02566874399781227, 0.009047290310263634, -0.0368349142372

In [21]:
embedding.data[0].embedding[:20]

[-0.01919545605778694,
 -0.02530249021947384,
 -0.0017007887363433838,
 0.018832262605428696,
 -0.03384426608681679,
 -0.019679713994264603,
 -0.020997973158955574,
 0.05160040035843849,
 -0.032149363309144974,
 -0.030373750254511833,
 -0.002162347314879298,
 -0.028974782675504684,
 -0.0024767788127064705,
 -0.031449880450963974,
 0.010310663841664791,
 0.018549779430031776,
 -0.046192850917577744,
 0.0414578802883625,
 0.00043759788968600333,
 0.04121575132012367]

In [22]:
embedding.data[1].embedding[:20]

[0.03653061389923096,
 -0.028329316526651382,
 0.01904255338013172,
 0.018198302015662193,
 0.009507875889539719,
 -0.015209921635687351,
 -0.02536773681640625,
 0.03602138161659241,
 -0.025381138548254967,
 -0.056658633053302765,
 -0.005350274033844471,
 -0.020878465846180916,
 0.0178900845348835,
 -0.03441328555345535,
 -0.002690213033929467,
 0.034868914633989334,
 -0.04859134554862976,
 -0.018104497343301773,
 -0.030875470489263535,
 0.07209637016057968]

## Cosine similarity

In [23]:
import numpy as np

In [24]:
# input=["feline friends say", "meow"]
input=['Hello, world!', 'Hola!']

resp = client.embeddings.create(
    model=embedding_model,
    input=input,

)

embedding_a = resp.data[0].embedding
embedding_b = resp.data[1].embedding

similarity_score = np.dot(embedding_a, embedding_b)
print(similarity_score)


0.5779476986310113


In [25]:
input=[
    'Azure OpenAI is a cloud-based AI platform.',
    'The Eiffel Tower.'
    ]

resp = client.embeddings.create(
    model=embedding_model,
    input=input,

)

embedding_a = resp.data[0].embedding
embedding_b = resp.data[1].embedding

similarity_score = np.dot(embedding_a, embedding_b)
print(similarity_score)

0.09228433168925644


In [26]:
input=[
    'The Sydney Harbour Bridge.',
    'The Eiffel Tower.'
    ]

resp = client.embeddings.create(
    model=embedding_model,
    input=input,

)

embedding_a = resp.data[0].embedding
embedding_b = resp.data[1].embedding

similarity_score = np.dot(embedding_a, embedding_b)
print(similarity_score)

0.4389014144161109


In [27]:
help(client.embeddings.create)

Help on method create in module openai.resources.embeddings:

create(
    *,
    input: 'Union[str, SequenceNotStr[str], Iterable[int], Iterable[Iterable[int]]]',
    model: 'Union[str, EmbeddingModel]',
    dimensions: 'int | NotGiven' = NOT_GIVEN,
    encoding_format: "Literal['float', 'base64'] | NotGiven" = NOT_GIVEN,
    user: 'str | NotGiven' = NOT_GIVEN,
    extra_headers: 'Headers | None' = None,
    extra_query: 'Query | None' = None,
    extra_body: 'Body | None' = None,
    timeout: 'float | httpx.Timeout | None | NotGiven' = NOT_GIVEN
) -> 'CreateEmbeddingResponse' method of openai.resources.embeddings.Embeddings instance
    Creates an embedding vector representing the input text.

    Args:
      input: Input text to embed, encoded as a string or array of tokens. To embed multiple
          inputs in a single request, pass an array of strings or array of token arrays.
          The input must not exceed the max input tokens for the model (8192 tokens for
          all emb