In [1]:
import os
import pandas as pd

# Directory holding the per-document chunk CSVs. The CHUNKS_DIR environment
# variable, when set, overrides the default so the notebook can be run on a
# machine with a different folder layout without editing this cell.
chunks_dir = os.environ.get(
    "CHUNKS_DIR",
    "C:\\Users\\olive\\Desktop\\UGent\\Thesis\\Pratice\\chunks",
)

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the load order (and the iteration order of the dicts below) reproducible.
chunks_files = sorted(f for f in os.listdir(chunks_dir) if f.endswith(".csv"))

# Map each file's base name (filename without ".csv") to its DataFrame.
dataframes = {}
for file in chunks_files:
    file_path = os.path.join(chunks_dir, file)
    df_name = os.path.splitext(file)[0]
    dataframes[df_name] = pd.read_csv(file_path)

# Map each base name to the plain list of its text chunks.
# NOTE: despite the name, the values are Python lists, not DataFrames.
text_chunks_df = {}
for name, df in dataframes.items():
    # Fail with a message that names the offending file instead of a bare
    # KeyError('sentence_chunk') that gives no hint which CSV is malformed.
    if 'sentence_chunk' not in df.columns:
        raise KeyError(
            f"'{name}.csv' has no 'sentence_chunk' column; found columns: {list(df.columns)}"
        )
    text_chunks_df[name] = df['sentence_chunk'].tolist()

In [2]:
import os
# Set the flag before importing sentence_transformers so that it is already
# present in the process environment when that package is imported.
# NOTE(review): presumably this stops the underlying transformers library from
# loading TensorFlow — confirm the installed version honours this variable.
os.environ["TRANSFORMERS_NO_TF"] = "1"

from sentence_transformers import SentenceTransformer

In [3]:
# Instantiate the sentence-embedding model once; the resulting `model` object
# is what the embedding cell calls `.encode()` on.
model = SentenceTransformer(
    model_name_or_path="intfloat/multilingual-e5-large-instruct",
)

In [4]:
import numpy as np

# Embed every loaded chunk file and persist the vectors alongside the text.
#
# For each DataFrame in `dataframes`: build the model inputs from the
# 'sentence_chunk' column, encode them, store one vector per row in an
# 'embedding' column, and write the frame back to "<chunks_dir>/<name>.csv".
# Because `name` is the base name the file was loaded under, this overwrites
# the source CSV in place.
#
# NOTE(review): the inputs are prefixed with "passage: ", which is the
# convention of the non-instruct E5 checkpoints. `model` is loaded from an
# "-instruct" checkpoint, whose model card appears to expect documents with
# no prefix — confirm, and keep this consistent with how queries are encoded.
for name, df in dataframes.items():
    print(f"Processing chunk: {name}")

    # Missing cells are read by pandas as NaN; coerce them to empty text so
    # they are not embedded as the literal word "nan".
    sentences = df['sentence_chunk'].fillna("").astype(str).tolist()

    # Prefix each chunk with "passage: " (see the review note above).
    sentences_prefixed = [f"passage: {s}" for s in sentences]

    # Ask for a NumPy array directly: the vectors are only turned into Python
    # lists below, so collecting them as a device tensor first and then
    # calling .cpu().numpy() is an unnecessary round trip.
    embeddings = model.encode(sentences_prefixed, batch_size=32, convert_to_numpy=True)

    # One list of floats per row.
    df['embedding'] = embeddings.tolist()

    # to_csv() writes each list as its string representation, so anything
    # reading this file back has to parse the 'embedding' column itself.
    save_path = os.path.join(chunks_dir, f"{name}.csv")
    df.to_csv(save_path, index=False)

print("All chunks processed and saved with embeddings.")


Processing chunk: 10201929_chunks
Processing chunk: 10229841_chunks
Processing chunk: 10372271_chunks
Processing chunk: 10423406_chunks
Processing chunk: 1675604_chunks
Processing chunk: 1765275_chunks
Processing chunk: 1883525_chunks
Processing chunk: 2109187_chunks
Processing chunk: 7609053_chunks
Processing chunk: 7989745_chunks
Processing chunk: 8183915_chunks
Processing chunk: 9111040_chunks
Processing chunk: 9177216_chunks
Processing chunk: 9188842_chunks
Processing chunk: 9299399_chunks
Processing chunk: 9419430_chunks
Processing chunk: 9707608_chunks
Processing chunk: 9858618_chunks
Processing chunk: 9889197_chunks
Processing chunk: 9989503_chunks
All chunks processed and saved with embeddings.
