In [None]:
import os
import pickle

import torch
import pandas as pd
from tqdm import tqdm

from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {DEVICE}')

Using device: cuda


In [None]:
api_key = "YOUR-API-KEY"
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")


In [None]:
# Load data
DATA_PATH = './data/'

pickle_file = os.path.join(DATA_PATH, 'prepd_data.pkl')
data = pd.read_pickle(pickle_file)

In [5]:
# Convert dataframe rows to LangChain Documents
docs = [
    Document(page_content=row['combined_info_pp'], metadata={"book_id": row['book_id'], "headline": row["headline"], "from_date": row["from_date"], "to_date": row["to_date"]})
    for idx, row in data.iterrows()
]

In [6]:
def split_text_by_words(text, max_words, overlap_words):
    # Split the text into words
    words = text.split()

    chunks = []
    start_idx = 0
    while start_idx < len(words):
        end_idx = min(start_idx + max_words, len(words))
        chunk = words[start_idx:end_idx]

        # Join the words back into text
        chunk_text = ' '.join(chunk)
        chunks.append(chunk_text)

        # Update the starting position with overlap
        start_idx += max_words - overlap_words

    return chunks

In [18]:
chunks = []
idx = 0

for doc in docs:
    temp_chunks = split_text_by_words(
        doc.page_content, max_words=200, overlap_words=20
    )
    
    for chunk in temp_chunks:
        new_metadata = dict(doc.metadata) 
        new_metadata['idx'] = idx 
        chunks.append(Document(page_content=chunk, metadata=new_metadata))
        idx += 1

In [20]:
new_data_path = os.path.join(DATA_PATH, "chunks.pkl")
with open(new_data_path, "wb") as file: 
    pickle.dump(chunks, file)

In [21]:
# Check the number of chunks created
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 26579


In [None]:
def create_and_save_embeddings(documents, embedding_model, batch_size=50, output_path="embeddings.pkl"):
    texts = [doc.page_content for doc in documents]
    metas = [doc.metadata for doc in documents]

    all_embeddings = []
    all_texts = []
    all_metas = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i+batch_size]
        batch_metas = metas[i:i+batch_size]

        try:
            batch_embeddings = embedding_model.embed_documents(batch_texts)
        except Exception as e:
            print(f"Error at batch {i}: {e}")
            continue

        all_embeddings.extend(batch_embeddings)
        all_texts.extend(batch_texts)
        all_metas.extend(batch_metas)

    with open(output_path, "wb") as f:
        pickle.dump({
            "texts": all_texts,
            "metas": all_metas,
            "embeddings": all_embeddings
        }, f)

    print(f"Saved {len(all_embeddings)} embeddings to {output_path}")