# Chunk Embedding using Azure OpenAI   

### Load environment variables and keys 

In [6]:
from dotenv import dotenv_values
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import openai
import pandas as pd
import numpy as np
import time
import requests

# Location of the .env file that holds the settings (point this at your own file).
env_name = "../../.env"
config = dotenv_values(env_name)

# KEYS_FROM decides where the Azure OpenAI settings are read from:
# Azure Key Vault when it equals "KEYVAULT", the .env file itself otherwise.
if config['KEYS_FROM'] != "KEYVAULT":
    # Settings come straight from the .env file.
    openai.api_type = config["OPENAI_API_TYPE"]
    openai.api_key = config["OPENAI_API_KEY"]
    openai.api_base = config["OPENAI_API_BASE"]
    openai.api_version = config["OPENAI_API_VERSION"]
    deployment_embedding = config["OPENAI_DEPLOYMENT_EMBEDDING"]
else:
    print('keyvault was selected.')
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    def _read_secret(secret_name):
        """Return the value of the named secret fetched through `client`."""
        return client.get_secret(secret_name).value

    # Same settings as the .env branch, stored in the vault under dash-separated names.
    openai.api_type = _read_secret("OPENAI-API-TYPE")
    openai.api_key = _read_secret("OPENAI-API-KEY")
    openai.api_base = _read_secret("OPENAI-API-BASE")
    openai.api_version = _read_secret("OPENAI-API-VERSION")
    deployment_embedding = _read_secret("OPENAI-DEPLOYMENT-EMBEDDING")



keyvault was selected.


#### Load the chunks and create embedding
In this section, we load the chunks into a pandas DataFrame, select the column to embed, and create a vector embedding for each chunk using Azure OpenAI.

In [8]:
def createEmbeddings(text, endpoint, api_key, api_version, embedding_model_deployment, timeout=None):
    """Request embedding vectors for `text` from an Azure OpenAI embeddings deployment.

    Args:
        text: Value sent as the ``input`` field of the request payload.
        endpoint: Base URL of the Azure OpenAI resource; a trailing ``/`` is tolerated.
        api_key: Key sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Deployment name placed in the URL path.
        timeout: Forwarded to ``requests.post``. Defaults to ``None`` (no time
            limit), which is what this function always used before.

    Returns:
        A list holding the ``embedding`` entry of every item in the response's
        ``data`` array, in response order.

    Raises:
        Exception: If the service answers with a status code other than 200.
            The message carries the parsed JSON error body when there is one,
            otherwise the status code and raw response text.
    """
    # Strip trailing slashes so an endpoint such as "https://host/" does not
    # produce a "//openai/..." path.
    base_url = str(endpoint).rstrip('/')
    request_url = (
        f"{base_url}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    request_payload = {
        'input': text
    }
    embedding_response = requests.post(request_url, json=request_payload, headers=headers, timeout=timeout)

    if embedding_response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a proxy error page);
        # fall back to the raw text so the real failure is not masked by a
        # JSON decoding error.
        try:
            error_detail = embedding_response.json()
        except ValueError:
            error_detail = f"HTTP {embedding_response.status_code}: {embedding_response.text}"
        raise Exception(f"failed to get embedding: {error_detail}")

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]


# Load the chunk table into a DataFrame.
df = pd.read_csv('AnalyzedPDF/Chunks.csv')

# Add an object-typed 'Embedding' column up front so each cell can later hold a whole vector.
df['Embedding'] = np.empty(len(df), dtype=object)

# Embed the chunks one request at a time, storing each vector as soon as it arrives.
for row_label, chunk_text in df['Chunk'].items():
    vectors = createEmbeddings(
        f"{chunk_text}",
        openai.api_base,
        openai.api_key,
        openai.api_version,
        deployment_embedding,
    )

    # A single input was sent, so only the first returned vector is kept.
    df.at[row_label, 'Embedding'] = vectors[0]

    # Short pause between consecutive requests.
    time.sleep(0.1)


We will rename the column names and add a new column as primary index.

In [9]:
# Display the DataFrame (at most the first 1000 rows); 'Id' is the first column after the index.
df.head(1000)

Unnamed: 0,Id,Ticker,Year,Quarter,Chunk,PageNumber,LineNumber,Embedding
0,1,MSFT,23,1,Microsoft FY23 First Quarter Earnings Conferen...,1,1,"[-0.022691458, -0.02892966, -0.01939041, -0.02..."
1,2,MSFT,23,1,"On the Microsoft Investor Relations website, y...",1,9,"[-0.022940217, -0.0083436845, -0.008599305, -0..."
2,3,MSFT,23,1,GAAP. They are included as additional clarifyi...,1,17,"[-0.01130778, -0.0038822712, 0.003553209, -0.0..."
3,4,MSFT,23,1,"same in constant currency, we will refer to th...",2,6,"[-0.01768585, -0.02943631, -0.00054391, -0.015..."
4,5,MSFT,23,1,"predictions, projections, or other statements ...",2,14,"[-0.009156934, -0.019673413, -0.0082705645, -0..."
...,...,...,...,...,...,...,...,...
437,438,MSFT,23,4,Can you just talk about where customers are ri...,44,19,"[-0.007982874, -0.011050153, 0.022338798, -0.0..."
438,439,MSFT,23,4,"complement, I'll call it, your databases, beca...",45,7,"[-0.0132768415, 0.0043709623, -0.0059512067, -..."
439,440,MSFT,23,4,"with a very disruptive business model. I mean,...",45,15,"[-0.017129857, -0.014327538, 0.016044645, -0.0..."
440,441,MSFT,23,4,"architecture lays out, our business model arou...",46,2,"[0.0039909924, -0.0018922517, 0.010486933, -0...."


Use the following code to save the embeddings and processed data for future use, or skip the previous part of the code and load the processed data to save into the db.

In [10]:
# Save the DataFrame, including the 'Embedding' column, to CSV for future use.
# index=False leaves the DataFrame index out of the file.
# NOTE(review): the embedding vectors are presumably written as their text form,
# so they will need parsing back into lists when the file is reloaded — confirm.
df.to_csv('AnalyzedPDF/ChunksEmbedding.csv', index=False)