In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from pathlib import Path
import os

# Set working directory
# All paths below are relative, so the process must run from the project root.
# The root defaults to the original checkout location; set the
# CREDITRUST_PROJECT_ROOT environment variable to run from another machine or
# folder without editing this cell.
PROJECT_ROOT = os.environ.get(
    'CREDITRUST_PROJECT_ROOT',
    'C:/Users/daniel.shobe/Desktop/schoolify/B5W6/creditrust-complaint-analysis',
)
os.chdir(PROJECT_ROOT)
print(f"Working directory set to: {os.getcwd()}")

# Input/output folders, resolved against the current working directory.
# The vector store folder is created on first run; an existing one is reused.
DATA_DIR, VECTOR_STORE_DIR = Path('data'), Path('vector_store')
VECTOR_STORE_DIR.mkdir(exist_ok=True)

# Read the already-filtered complaints table that the rest of the cell chunks
# and embeds.
filtered_df = pd.read_csv(DATA_DIR.joinpath('filtered_complaints.csv'))
print(f"Loaded dataset: {filtered_df.shape}")

# Step 1: Chunk narratives
# Split every complaint narrative into pieces of at most 512 characters
# (length is measured with len) that overlap by 50 characters, and keep the
# complaint id and product next to each piece.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    length_function=len,
)

# Pinning the columns keeps `chunked_df['chunk']` valid even when no chunk is
# produced (an empty list would otherwise yield a column-less frame).
chunk_columns = ['Complaint ID', 'Product', 'chunk']
chunks = []

# Walk only the three columns that are needed instead of iterrows(), which
# materialises a full Series for every row of the frame.
narrative_rows = zip(
    filtered_df['Complaint ID'],
    filtered_df['Product'],
    filtered_df['cleaned_narrative'],
)
for complaint_id, product, narrative in narrative_rows:
    # An empty narrative is read back from CSV as NaN (a float); the splitter
    # only accepts strings, so such rows are skipped rather than crashing.
    if not isinstance(narrative, str):
        continue
    chunks.extend(
        {'Complaint ID': complaint_id, 'Product': product, 'chunk': chunk}
        for chunk in text_splitter.split_text(narrative)
    )

chunked_df = pd.DataFrame(chunks, columns=chunk_columns)
chunked_df.to_csv(DATA_DIR / 'chunked_complaints.csv', index=False)
print(f"Created {len(chunked_df)} chunks")

# Step 2: Generate embeddings
# Encode every chunk with the 'all-MiniLM-L6-v2' sentence-transformer. The
# texts are passed in DataFrame row order, so embedding i belongs to row i of
# chunked_df.
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_texts = chunked_df['chunk'].tolist()
embeddings = model.encode(chunk_texts, show_progress_bar=True)
print(f"Generated embeddings: {embeddings.shape}")

# Step 3: Create FAISS index
# Build a flat L2 index sized to the embedding width (second axis of the
# embeddings array), add all vectors to it, and write it to the vector store
# folder.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

index_path = VECTOR_STORE_DIR / 'complaint_index.faiss'
# The path is converted to str before being handed to faiss.write_index.
faiss.write_index(index, str(index_path))
print(f"Created FAISS index with {index.ntotal} vectors")

# Step 4: Save metadata
# Store the chunk table (complaint id, product, chunk text) beside the index so
# that a vector position returned by a search can be looked up by row number.
metadata_path = VECTOR_STORE_DIR / 'chunk_metadata.csv'
chunked_df.to_csv(metadata_path, index=False)
print(f"Saved metadata to {VECTOR_STORE_DIR / 'chunk_metadata.csv'}")

  from tqdm.autonotebook import tqdm, trange


Working directory set to: C:\Users\daniel.shobe\Desktop\schoolify\B5W6\creditrust-complaint-analysis
Loaded dataset: (211097, 20)
Created 561133 chunks


Batches:   0%|          | 0/17536 [00:00<?, ?it/s]