# Chunk Embedding using Azure OpenAI

### Load environment variables and keys 

In [1]:
from dotenv import dotenv_values

# Path to the .env file that holds the OpenAI settings read later in this notebook
env_name = "../../llm.env" # change this to the path of your own .env file
# Parse the .env file into a mapping of variable names to their values
config = dotenv_values(env_name)

#### Load the chunks and create embeddings
In this section, we will load the data into a pandas DataFrame, select the columns we need, and create vector embeddings using Azure OpenAI.

In [2]:
import openai
import pandas as pd
import pandas as pd
import numpy as np
import time

# Configure the openai module from the values loaded out of the .env file.
# Each key below must be present in `config`; a missing key raises KeyError.
openai.api_type = config["OPENAI_API_TYPE"] 
openai.api_key = config["OPENAI_API_KEY"]
openai.api_base = config["OPENAI_API_BASE"] 
openai.api_version = config["OPENAI_API_VERSION"] 

def createEmbeddings(text):
    """Return the embedding for ``text`` from the configured embedding deployment.

    Calls ``openai.Embedding.create`` with ``text`` as the input and the
    deployment named by ``config["OPENAI_DEPLOYMENT_EMBEDDING"]`` as the engine,
    then returns the ``'embedding'`` field of the first entry in the
    response's ``'data'`` list.
    """
    api_result = openai.Embedding.create(
        input=text,
        engine=config["OPENAI_DEPLOYMENT_EMBEDDING"],
    )
    # A single input was sent, so only the first entry of 'data' is read.
    first_entry = api_result['data'][0]
    return first_entry['embedding']

# Load the chunk table from CSV into a DataFrame
df = pd.read_csv('AnalyzedPDF/CombinedResults/Chunks.csv')

# Add an object-typed 'Embedding' column so every cell can hold a whole vector
df['Embedding'] = np.empty((len(df),), dtype=object)

# Walk the 'Chunk' column, embedding one chunk per request
for row_label, chunk in df['Chunk'].items():
    # The chunk is formatted into a string before it is sent for embedding
    chunk_text = f"{chunk}"

    # Store the returned vector in this row's 'Embedding' cell
    df.at[row_label, 'Embedding'] = createEmbeddings(chunk_text)

    # Short pause between consecutive requests
    time.sleep(0.1)


We will rename the column names and add a new column as primary index.

In [3]:
# Display up to the first 1000 rows of the DataFrame; 'Id' is the first column
# after the index and 'Embedding' holds the vector created for each chunk.
df.head(1000)

Unnamed: 0,Id,Ticker,Year,Quarter,Chunk,PageNumber,LineNumber,Embedding
0,1,MSFT,23,1,Microsoft FY23 First Quarter Earnings Conferen...,1,1,"[-0.022691456601023674, -0.028929658234119415,..."
1,2,MSFT,23,1,"On the Microsoft Investor Relations website, y...",1,9,"[-0.022940216585993767, -0.008343684487044811,..."
2,3,MSFT,23,1,GAAP. They are included as additional clarifyi...,1,17,"[-0.01130777969956398, -0.0038822712376713753,..."
3,4,MSFT,23,1,"same in constant currency, we will refer to th...",2,6,"[-0.017685849219560623, -0.02943631075322628, ..."
4,5,MSFT,23,1,"predictions, projections, or other statements ...",2,14,"[-0.00915693398565054, -0.019673412665724754, ..."
...,...,...,...,...,...,...,...,...
437,438,MSFT,23,4,Can you just talk about where customers are ri...,44,19,"[-0.004939808044582605, 0.000936132506467402, ..."
438,439,MSFT,23,4,"complement, I'll call it, your databases, beca...",45,7,"[-0.0132768414914608, 0.004370962269604206, -0..."
439,440,MSFT,23,4,"with a very disruptive business model. I mean,...",45,15,"[-0.013180367648601532, -0.007650672923773527,..."
440,441,MSFT,23,4,"architecture lays out, our business model arou...",46,2,"[0.003990992438048124, -0.0018922516610473394,..."


Use the following code to save the embeddings and processed data for future use. Alternatively, skip the previous part of the code and load the already-processed data to save it into the database.

In [5]:
# Save the DataFrame, including the 'Embedding' column, as CSV for future use.
# index=False leaves the row index out of the file.
# NOTE(review): the embedding values are presumably Python lists, which CSV
# stores as text — they will need to be parsed back when reloaded; confirm
# in the loading step.
df.to_csv('AnalyzedPDF/CombinedResults/ChunksEmbedding.csv', index=False)