In [None]:
%pip install -U nltk voyageai scipy numpy pandas
#!pip install -U nltk 

In [4]:
#############################################################################
###### my data are several text files, each file is one book chapter ########
#############################################################################

##### defining chunking function - 

# function to split sentence
import nltk.data

# split the text into overlapping windows of sentences
def load_and_split_text_by_sentences(file_path, window_size=5, overlap=2):
    """Load a text file and split it into overlapping windows of sentences.

    Consecutive windows start ``window_size - overlap`` sentences apart. The
    last window may hold fewer than ``window_size`` sentences, so trailing
    sentences (and texts shorter than one window) are kept instead of being
    silently dropped.

    Args:
        file_path (str): The path to the UTF-8 encoded text file.
        window_size (int, optional): The number of sentences in each window. Defaults to 5.
        overlap (int, optional): The number of sentences shared by consecutive
            windows. Must satisfy ``0 <= overlap < window_size``. Defaults to 2.

    Returns:
        list: A list of text snippets, where each snippet is the sentences of
            one window joined by single spaces. Empty if no sentences are found.

    Raises:
        ValueError: If ``window_size`` is smaller than 1 or ``overlap`` is not
            in the range ``[0, window_size)``.
    """
    # Function-scope imports so the function does not depend on which
    # nltk submodule the surrounding notebook happened to import.
    import nltk
    from nltk.tokenize import sent_tokenize

    # Validate before doing any I/O: a step of 0 would make range() fail with
    # an unrelated message, and a negative step would silently yield nothing.
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if not 0 <= overlap < window_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < window_size, "
            f"got overlap={overlap}, window_size={window_size}"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Collapse new lines and any run of whitespace into single spaces
    text = " ".join(text.split())

    # sent_tokenize resolves the English Punkt model itself. The resource is
    # named 'punkt_tab' in recent NLTK releases and 'punkt' in older ones, so
    # on a missing-resource error both are fetched before retrying once.
    try:
        sentences = sent_tokenize(text)
    except LookupError:
        for resource in ("punkt_tab", "punkt"):
            nltk.download(resource, quiet=True)
        sentences = sent_tokenize(text)

    # Create overlapping windows of sentences
    step = window_size - overlap
    documents = []
    for start in range(0, len(sentences), step):
        documents.append(" ".join(sentences[start: start + window_size]))
        # Stop as soon as a window reaches the last sentence; otherwise the
        # next start would only repeat sentences that are already covered.
        if start + window_size >= len(sentences):
            break

    return documents


In [8]:
import time
import os
import voyageai
import scipy
import numpy as np
import pandas as pd

#####################################################################################################################
## the main function. adapt "wind" and "olap" and replace the function for chunking, if you want
## this function creates:
## (1) for each given text file the desired chunking
## (2) embeddings of all text-chunks in the given text-file, saved as a CSV.
## (3) additionally, the used text-chunks are stored in a separate text file, saved as a CSV.
## this means, if you provide two text files, then in the end you'll receive two CSV files with the chunk embeddings 
## and two CSV-files with the underlying text chunks.
#####################################################################################################################

def process_file(file_path, wind=5, olap=2, api_key=None, batch_size=128, delay=1):
    """
    Processes a single text file, generates embeddings and text chunks, and saves them to CSV files.

    The file is chunked with ``load_and_split_text_by_sentences``, every chunk
    is embedded with the Voyage AI model "voyage-3-large", and two CSV files
    (without header or index) are written to the current working directory:

    * ``embeddings_<name>_w<wind>_o<olap>_v3l.csv``  - one embedding per row
    * ``text_chunks_<name>_w<wind>_o<olap>_v3l.csv`` - one text chunk per row

    where ``<name>`` is the input file name without directory and extension.
    Row i of both files refers to the same chunk.

    Args:
        file_path (str): Path to the text file to process.
        wind (int, optional): Number of sentences per chunk. Defaults to 5.
        olap (int, optional): Number of sentences shared by consecutive chunks. Defaults to 2.
        api_key (str, optional): Voyage AI API key. If None, it is read from the
            ``VOYAGE_API_KEY`` environment variable; if that is unset too, the
            decision is left to the voyageai client's own key lookup.
        batch_size (int, optional): Number of chunks sent per embedding request. Defaults to 128.
        delay (float, optional): Seconds to wait between two requests. Defaults to 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    documents = load_and_split_text_by_sentences(file_path, window_size=wind, overlap=olap) #function used to chunk the text

    # Never hard-code the key in the notebook: take it from the caller or the environment.
    if api_key is None:
        api_key = os.environ.get("VOYAGE_API_KEY")
    vo = voyageai.Client(api_key=api_key)

    documents_embeddings = []  # flat list, one embedding per chunk, in chunk order
    total_batches = (len(documents) + batch_size - 1) // batch_size  # ceiling division

    for batch_num, i in enumerate(range(0, len(documents), batch_size), start=1):
        print(f"Processing batch {batch_num}/{total_batches}")
        embeddings = vo.embed(
            documents[i : i + batch_size],
            model="voyage-3-large",
            input_type="document",
        ).embeddings
        documents_embeddings.extend(embeddings)

        # Add a delay between queries; nothing to wait for after the last one
        if batch_num < total_batches:
            time.sleep(delay)

    embeddings_df = pd.DataFrame(documents_embeddings)
    text_chunks_df = pd.DataFrame({"text": documents})

    file_name = os.path.splitext(os.path.basename(file_path))[0]
    embeddings_df.to_csv(f"embeddings_{file_name}_w{wind}_o{olap}_v3l.csv", index=False, header=False)
    text_chunks_df.to_csv(f"text_chunks_{file_name}_w{wind}_o{olap}_v3l.csv", index=False, header=False)

# Assuming all files are in the same directory
directory_path = r"folder/path"

# Expected chapter files: Chapter01.txt ... Chapter27.txt
chapter_file_names = [f"Chapter{chapter_num:02d}.txt" for chapter_num in range(1, 28)] # adapt file name scheme

# Fail before the first embedding request if any chapter is missing, so a
# typo in the path or naming scheme does not surface halfway through the run
# after API calls for the earlier chapters have already been made.
missing_files = [
    name for name in chapter_file_names
    if not os.path.isfile(os.path.join(directory_path, name))
]
if missing_files:
    raise FileNotFoundError(f"Missing chapter files in {directory_path!r}: {missing_files}")

for file_name in chapter_file_names:
    file_path = os.path.join(directory_path, file_name)
    process_file(file_path)


# Afterwards, all per-chapter embedding CSV files are combined in R.

Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/2
Processing batch 2/2
Processing batch 1/1
Processing batch 1/2
Processing batch 2/2
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/2
Processing batch 2/2
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
Processing batch 1/1
