# Chunk Embedding using Azure OpenAI   

### Load environment variables and keys 

In [1]:
from dotenv import dotenv_values
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
import openai
import pandas as pd
import numpy as np
import time
import requests

# Path of the .env file that holds the settings.
env_name = "../../.env" # change to your own .env file name
config = dotenv_values(env_name)

# KEYS_FROM selects the source of the Azure OpenAI settings: the .env file
# itself, or an Azure Key Vault when it is set to "KEYVAULT".
if config['KEYS_FROM'] != "KEYVAULT":
    # Copy the connection settings straight from the .env values.
    for attribute, config_key in (
        ("api_type", "OPENAI_API_TYPE"),
        ("api_key", "OPENAI_API_KEY"),
        ("api_base", "OPENAI_API_BASE"),
        ("api_version", "OPENAI_API_VERSION"),
    ):
        setattr(openai, attribute, config[config_key])
    deployment_embedding = config["OPENAI_DEPLOYMENT_EMBEDDING"]
else:
    print('keyvault was selected.')
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    # Authenticate with whatever credential DefaultAzureCredential resolves.
    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    # Fetch each secret in turn and store it on the openai module.
    for attribute, secret_name in (
        ("api_type", "OPENAI-API-TYPE"),
        ("api_key", "OPENAI-API-KEY"),
        ("api_base", "OPENAI-API-BASE"),
        ("api_version", "OPENAI-API-VERSION"),
    ):
        setattr(openai, attribute, client.get_secret(secret_name).value)
    deployment_embedding = client.get_secret("OPENAI-DEPLOYMENT-EMBEDDING").value



#### Load the chunks and create embedding
In this section, we load the chunks into a pandas DataFrame, select the `Chunk` column, and create a vector embedding for each chunk using Azure OpenAI.

In [2]:
def createEmbeddings(text, endpoint, api_key, api_version, embedding_model_deployment, timeout=60):
    """Request embedding vectors for ``text`` from an Azure OpenAI deployment.

    Sends a POST request to
    ``{endpoint}/openai/deployments/{embedding_model_deployment}/embeddings``
    with ``text`` as the ``input`` field of the JSON body.

    Args:
        text: Value sent as the ``input`` field of the request body.
        endpoint: Base URL of the Azure OpenAI resource. Trailing ``/``
            characters are stripped so the request URL never contains ``//``.
        api_key: Key sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Name of the embedding model deployment.
        timeout: Timeout in seconds handed to ``requests.post``. Defaults to
            60 so a stalled connection cannot hang the notebook forever;
            pass ``None`` to wait indefinitely.

    Returns:
        list: The ``embedding`` value of every entry in the response's
        ``data`` array, in response order.

    Raises:
        RuntimeError: If the service responds with a status code other than
            200. The message contains the status code and the response body.

    Exceptions raised by ``requests.post`` itself (connection errors,
    timeouts) are not caught here and propagate to the caller.
    """
    base_url = str(endpoint).rstrip("/")
    request_url = f"{base_url}/openai/deployments/{embedding_model_deployment}/embeddings?api-version={api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    request_payload = {
        'input': text
    }
    embedding_response = requests.post(request_url, json=request_payload, headers=headers, timeout=timeout)

    if embedding_response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. an HTML page from a
        # gateway); fall back to the raw text so the real failure stays visible.
        try:
            error_detail = embedding_response.json()
        except ValueError:
            error_detail = embedding_response.text
        raise RuntimeError(
            f"failed to get embedding (HTTP {embedding_response.status_code}): {error_detail}"
        )

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]


# Read the chunk table from CSV.
df = pd.read_csv('AnalyzedPDF/Chunks.csv')

# Add an object-typed 'Embedding' column so each cell can hold one vector.
df['Embedding'] = np.empty(len(df), dtype=object)

# Embed the chunks one request at a time and store each vector on its row.
for index, row in df.iterrows():
    # createEmbeddings returns a list of vectors; only the first is kept.
    df.at[index, 'Embedding'] = createEmbeddings(
        text=f"{row['Chunk']}",
        endpoint=openai.api_base,
        api_key=openai.api_key,
        api_version=openai.api_version,
        embedding_model_deployment=deployment_embedding,
    )[0]
    # Short pause between requests (presumably to stay under the service's
    # rate limit; confirm against your quota).
    time.sleep(0.1)


Let's inspect the result. Each row keeps its original columns (with `Id` as the identifier) and now has an additional `Embedding` column.

In [3]:
# Show up to the first 1000 rows of the DataFrame, including the new 'Embedding' column.
df.iloc[:1000]

Unnamed: 0,Id,Ticker,Year,Quarter,Chunk,PageNumber,LineNumber,Embedding
0,1,MSFT,23,4,Microsoft FY23 Fourth Quarter Earnings Confere...,1,1,"[-0.023074152, -0.025937367, -0.019705663, -0...."
1,2,MSFT,23,4,press release and financial summary slide deck...,1,9,"[-0.018870514, -0.009296308, -0.00529658, -0.0..."
2,3,MSFT,23,4,to the measures of financial performance prepa...,1,17,"[-0.013240397, -0.0035947678, 0.0057165413, -0..."
3,4,MSFT,23,4,effect of foreign currency rate fluctuations. ...,2,5,"[-0.022699066, -0.032132443, 0.002210948, -0.0..."
4,5,MSFT,23,4,"During this call, we will be making forward-lo...",2,13,"[-0.008702822, -0.016210131, -0.016197275, -0...."
...,...,...,...,...,...,...,...,...
437,438,MSFT,23,3,"infrastructure apps in other areas. So, it wou...",41,15,"[-0.0027570303, -0.01633089, 0.024878025, -0.0..."
438,439,MSFT,23,3,"Microsoft 365 suite value, which is significan...",42,2,"[-3.3282693e-05, -0.009024003, 0.017882427, -0..."
439,440,MSFT,23,3,"that our services, as they are competitive, ge...",42,10,"[-0.0068266504, -0.015404745, 0.025249982, -0...."
440,441,MSFT,23,3,Microsoft 365 to close these loops in a new wa...,42,18,"[-0.020697111, -0.021852173, -0.0063931374, -0..."


Use the following code to save the embeddings and processed data for future use. On later runs you can skip the previous part of the code and load the processed data directly to save it into the database.

In [4]:
# Persist the DataFrame, without its index, so it can be reloaded later.
# NOTE(review): the list values in 'Embedding' are presumably written as text
# and need parsing when the file is read back — confirm in the loading code.
df.to_csv(path_or_buf='AnalyzedPDF/ChunksEmbedding.csv', index=False)