In [59]:
from openai import OpenAI
from tqdm import tqdm
import pickle
import pandas as pd

# Credentials must never be stored in the notebook. When `api_key` is omitted,
# the OpenAI client reads the key from the OPENAI_API_KEY environment variable.
# NOTE(review): the key that used to be hardcoded on this line is exposed in
# history/outputs and should be revoked.
client = OpenAI()
# Intended number of text chunks sent per embeddings request.
embedding_batch_size = 512

In [33]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load 'text_chunks.pkl' if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('text_chunks.pkl', 'rb') as file_text_chunks:
    text_chunks = pickle.load(file_text_chunks)
print(len(text_chunks))

2688


In [42]:
def create_batches(data_dict, max_batch_size=512):
    """
    Flatten a dictionary of lists into (key, value) pairs and group them into batches.

    Pairs are emitted in dictionary iteration order, and within a key in list
    order. A batch may contain pairs from several keys, and the pairs of one
    key may be split across consecutive batches. Keys whose list is empty
    contribute no pairs.

    Args:
        data_dict: A dictionary whose values are lists (e.g. text chunks per file).
        max_batch_size: Maximum number of (key, value) pairs per batch. Must be >= 1.

    Returns:
        A list of batches, where each batch is a non-empty list of (key, value)
        tuples. Every batch except possibly the last holds exactly
        max_batch_size pairs. Returns an empty list when there are no pairs.

    Raises:
        ValueError: If max_batch_size is smaller than 1.
    """
    # A non-positive size used to produce a leading empty batch followed by
    # single-item batches; reject it explicitly instead.
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

    batches = []
    current_batch = []

    for key, values in tqdm(data_dict.items(), desc="Creating batches"):
        for value in values:
            # Flush the current batch once it is full, before adding the next pair.
            if len(current_batch) == max_batch_size:
                batches.append(current_batch)
                current_batch = []
            current_batch.append((key, value))

    # Keep the trailing, partially filled batch.
    if current_batch:
        batches.append(current_batch)

    return batches

In [32]:
def get_embedding(list_texts, model="text-embedding-3-small"):
    """
    Compute embeddings for a list of texts with a single embeddings request.

    Newlines in each text are replaced by spaces before the request is made.

    Args:
        list_texts: Iterable of strings to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        A list with one embedding per input text, taken from the response's
        `data` entries in the order they are returned. An empty input yields
        an empty list without calling the API.
    """
    list_texts_replaced = [t.replace("\n", " ") for t in list_texts]
    # Nothing to embed: skip the network round trip entirely.
    if not list_texts_replaced:
        return []
    response = client.embeddings.create(input=list_texts_replaced, model=model)
    return [item.embedding for item in response.data]

#get_embedding(["I love Kitty", "I am Danny Moldovan"], model='text-embedding-3-small')

In [57]:
def create_embeddings_for_text_chunks(text_chunks, model="text-embedding-3-small", embedding_batch_size = 512):
    """
    Compute an embedding for every text chunk, batching the requests.

    Args:
        text_chunks: Dictionary mapping a key (a filename in this notebook) to
            a list of text chunks.
        model: Embedding model name, forwarded to get_embedding for every batch.
        embedding_batch_size: Maximum number of chunks sent per request.

    Returns:
        A dictionary mapping each key to the list of embeddings of its chunks,
        in the same order as the chunks. Keys whose chunk list is empty do not
        appear in the result.
    """
    batches_text_chunks = create_batches(text_chunks, embedding_batch_size)

    embeddings = {}

    for batch in tqdm(batches_text_chunks, desc="Computing embeddings for batches of text chunks"):
        filenames = [filename for filename, _ in batch]
        chunks = [chunk for _, chunk in batch]
        # Forward the requested model; it was previously dropped, so the
        # default model was always used regardless of the argument.
        batch_embeddings = get_embedding(chunks, model=model)

        # A batch can span several files, so regroup the embeddings per file.
        for filename, embedding in zip(filenames, batch_embeddings):
            embeddings.setdefault(filename, []).append(embedding)

    return embeddings

In [58]:
# Use the batch size configured at the top of the notebook instead of relying
# on the function's own default, so the constant is the single place to tune it.
embeddings = create_embeddings_for_text_chunks(text_chunks, embedding_batch_size=embedding_batch_size)
# Number of keys (files) that received at least one embedding.
len(embeddings)

Creating Batches: 100%|██████████| 2688/2688 [00:00<00:00, 176061.73it/s]
Computing embeddings for batches of text chunks: 100%|██████████| 18/18 [03:18<00:00, 11.04s/it]


2688

In [79]:
def convert_dictionaries_to_dataframe(text_chunks, embeddings):
    """
    Build a dataframe with one row per (filename, text chunk, embedding).

    Chunks and embeddings are paired per filename, by position within that
    filename's lists, so the result does not depend on the two dictionaries
    sharing the same key order. Rows follow the iteration order of
    `text_chunks`. Keys present only in `embeddings` are ignored.

    Args:
        text_chunks: Dictionary mapping filename -> list of text chunks.
        embeddings: Dictionary mapping filename -> list of embeddings, one per
            chunk of that file. A filename with no chunks may be absent.

    Returns:
        A pandas DataFrame with columns 'filename', 'text chunk', 'embedding'.

    Raises:
        ValueError: If a filename has a different number of chunks and embeddings.
    """
    rows = []
    for filename, chunks in text_chunks.items():
        file_embeddings = embeddings.get(filename, [])
        # Positional pairing is only meaningful when the counts agree; a
        # mismatch would otherwise silently mispair or drop rows.
        if len(chunks) != len(file_embeddings):
            raise ValueError(
                f"{filename!r}: {len(chunks)} text chunks but "
                f"{len(file_embeddings)} embeddings"
            )
        rows.extend(
            (filename, chunk, embedding)
            for chunk, embedding in zip(chunks, file_embeddings)
        )

    return pd.DataFrame(rows, columns=['filename', 'text chunk', 'embedding'])

In [81]:
df = convert_dictionaries_to_dataframe(text_chunks, embeddings)
# Sanity check after the transform: exactly one row per text chunk.
assert len(df) == sum(len(chunks) for chunks in text_chunks.values()), \
    "row count does not match the number of text chunks"
print(len(df))
df.head()

8843


Unnamed: 0,filename,text chunk,embedding
0,apidocs/api.python.langchain.com/en/latest/_mo...,\n\n\n\nOverview: module code — 🦜🔗 LangChain ...,"[-0.038963668048381805, 0.060036975890398026, ..."
1,apidocs/api.python.langchain.com/en/latest/_mo...,.base\nlangchain.agents.chat.output_parser\nla...,"[0.013684544712305069, 0.05675216019153595, 0...."
2,apidocs/api.python.langchain.com/en/latest/_mo...,.langchain\nlangchain.callbacks.tracers.langch...,"[0.012743242084980011, 0.04998200014233589, 0...."
3,apidocs/api.python.langchain.com/en/latest/_mo...,chain.chains.openai_functions.qa_with_structur...,"[0.015767386183142662, 0.030865758657455444, 0..."
4,apidocs/api.python.langchain.com/en/latest/_mo...,_loaders.arxiv\nlangchain.document_loaders.ass...,"[0.02353993058204651, 0.05262213572859764, 0.0..."


In [83]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-16.1.0


In [84]:
# Persist the chunk/embedding table in Feather format (requires pyarrow).
df.to_feather(path="text_chunks_and_embedding_dataframe.feather")