In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the filtered dataset
filtered_df = pd.read_csv('../filtered_complaints/filtered_complaints.csv')

# Define chunking function
def chunk_text(text, chunk_size=128, chunk_overlap=64):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - chunk_overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks

# Create a list for all chunks and their corresponding IDs
all_chunks = []
ids = []

for index, row in filtered_df.iterrows():
    chunks = chunk_text(row['Consumer complaint narrative'])
    all_chunks.extend(chunks)
    ids.extend([row['Complaint ID']] * len(chunks))

# Initialize the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each chunk
embeddings = model.encode(all_chunks, batch_size=10, show_progress_bar=True)

# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings).astype(np.float32))

# Save the embeddings and IDs for later retrieval
np.save('vector_store/embeddings.npy', embeddings)
np.save('vector_store/ids.npy', ids)

print("Chunking, embedding, and indexing completed successfully.")

Batches:  15%|█▌        | 4505/29860 [45:29<4:16:01,  1.65it/s]


KeyboardInterrupt: 