In [2]:
# Add project root to sys.path so config can be imported
import sys
from pathlib import Path

# Set project root (assumes this notebook is in 'notebooks/')
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Imports
import pandas as pd
import numpy as np
import torch
import time
import os
from typing import List, Dict, Tuple
from tqdm.notebook import tqdm
import joblib

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

import chromadb
from chromadb.config import Settings

# Import project-specific config
from config.config import PROCESSED_DATA_DIR, VECTOR_STORE_DIR

# Environment settings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Ensure vector store directory exists
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)


Using device: mps


In [3]:
print("Loading complaint data...")
df = pd.read_csv(PROCESSED_DATA_DIR / 'filtered_complaints.csv')
print(f"Loaded {len(df)} complaints")
df.head()


Loading complaint data...
Loaded 443472 complaints


Unnamed: 0,complaint_narrative,cleaned_narrative,product,sub_product,issue,sub_issue,company,state,date_received,complaint_id
0,A XXXX XXXX card was opened under my name by a...,a [REDACTED] [REDACTED] card was opened under ...,credit card,store credit card,getting a credit card,card opened without my consent or knowledge,"citibank, n.a.",tx,2025-06-13,14069121
1,I made the mistake of using my wellsfargo debi...,i made the mistake of using my wellsfargo debi...,checking or savings account,checking account,managing an account,deposits and withdrawals,wells fargo & company,id,2025-06-13,14061897
2,"Dear CFPB, I have a secured credit card with c...",dear cfpb i have a secured credit card with ci...,credit card,general-purpose credit card or charge card,"other features, terms, or problems",other problem,"citibank, n.a.",ny,2025-06-12,14047085
3,I have a Citi rewards cards. The credit balanc...,i have a citi rewards cards the credit balance...,credit card,general-purpose credit card or charge card,incorrect information on your report,account information incorrect,"citibank, n.a.",il,2025-06-12,14040217
4,b'I am writing to dispute the following charge...,i am writing to dispute the following charges ...,credit card,general-purpose credit card or charge card,problem with a purchase shown on your statement,credit card company isn't resolving a dispute ...,"citibank, n.a.",tx,2025-06-09,13968411


In [4]:
def create_complaint_chunks(complaint_text: str, metadata: Dict, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    texts = splitter.split_text(complaint_text)
    texts = [text for text in texts if len(text.strip()) > 50]
    return [
        Document(page_content=text, metadata={**metadata, "chunk_index": i, "total_chunks": len(texts)})
        for i, text in enumerate(texts)
    ]


In [5]:
print("Creating chunks...")
chunk_size = 500
chunk_overlap = 50
all_documents = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Chunking complaints"):
    metadata = {
        'complaint_id': str(row['complaint_id']),
        'product': row['product'],
        'sub_product': row['sub_product'],
        'company': row['company'],
        'state': row['state'],
        'date_received': row['date_received']
    }
    chunks = create_complaint_chunks(row['cleaned_narrative'], metadata, chunk_size, chunk_overlap)
    all_documents.extend(chunks)

print(f"Total chunks created: {len(all_documents)}")


Creating chunks...


Chunking complaints:   0%|          | 0/443472 [00:00<?, ?it/s]

Total chunks created: 1336041


In [6]:
model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name, device=device)

def generate_embeddings(documents: List[Document], batch_size: int = 64) -> Tuple[List[str], List[Dict], List[np.ndarray]]:
    ids = [f"chunk_{doc.metadata['complaint_id']}_{doc.metadata['chunk_index']}" for doc in documents]
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]

    embeddings = []
    total_time = 0
    n_batches = (len(texts) + batch_size - 1) // batch_size

    for i in tqdm(range(0, len(texts), batch_size), total=n_batches, desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]
        start = time.time()
        batch_embeddings = embedding_model.encode(
            batch_texts, convert_to_numpy=True, show_progress_bar=False, batch_size=batch_size
        )
        embeddings.extend(batch_embeddings)
        total_time += time.time() - start

    print(f"Total time: {total_time:.2f}s | Avg speed: {len(texts)/total_time:.1f} docs/s")
    return ids, metadatas, embeddings


In [7]:
cache_path = Path("../.cache/cached_embeddings.pkl")

if cache_path.exists():
    print("Loading cached embeddings...")
    ids, metadatas, embeddings = joblib.load(cache_path)
else:
    ids, metadatas, embeddings = generate_embeddings(all_documents)
    joblib.dump((ids, metadatas, embeddings), cache_path)


Generating embeddings:   0%|          | 0/20876 [00:00<?, ?it/s]

Total time: 2011.38s | Avg speed: 664.2 docs/s


In [8]:
chroma_client = chromadb.Client(Settings(
    persist_directory=str(VECTOR_STORE_DIR),
    is_persistent=True
))

collection_name = "financial_complaints"
try:
    collection = chroma_client.get_collection(collection_name)
    print("Collection loaded.")
except:
    collection = chroma_client.create_collection(
        name=collection_name,
        metadata={"description": "Financial complaints embeddings"}
    )
    print("Collection created.")


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Collection created.


In [10]:
print("Indexing documents to ChromaDB in batches...")

# Configure safe batch size under the 5461 limit
batch_size = 1000

for i in tqdm(range(0, len(ids), batch_size), desc="Adding to ChromaDB"):
    batch_ids = ids[i:i + batch_size]
    batch_embeddings = embeddings[i:i + batch_size]
    batch_metadatas = metadatas[i:i + batch_size]
    batch_docs = [doc.page_content for doc in all_documents[i:i + batch_size]]
    
    collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        metadatas=batch_metadatas,
        documents=batch_docs
    )

print(f"\n✅ Total documents in store: {collection.count()}")


Indexing documents to ChromaDB in batches...


Adding to ChromaDB:   0%|          | 0/1337 [00:00<?, ?it/s]


✅ Total documents in store: 1336041


In [11]:
query_text = "I have an issue with my credit card payment"
query_embedding = embedding_model.encode(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,
    include=['documents', 'metadatas', 'distances']
)

print("\nQuery Results:")
for i, (doc, metadata, distance) in enumerate(zip(
    results['documents'][0], results['metadatas'][0], results['distances'][0]
)):
    print(f"\nResult {i+1}:")
    print(f"Distance: {distance:.4f}")
    print(f"Product: {metadata['product']}")
    print(f"Company: {metadata['company']}")
    print(f"Text: {doc[:200]}...")


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



Query Results:

Result 1:
Distance: 0.4562
Product: credit card or prepaid card
Company: ally financial inc.
Text: i am making payments yet not getting to use my card...

Result 2:
Distance: 0.4648
Product: credit card or prepaid card
Company: avant holding company, inc.
Text: saying my payment did not go through though there are funds in the account i tried to pay with i have other credit cards ive no problem paying but this one never seems to take my payments...


In [13]:
def search_complaints(
    query_text: str,
    top_k: int = 3,
    filter_conditions: Dict = None,
    show_similarity: bool = True
):
    """
    Search ChromaDB for complaint chunks similar to the query.

    Args:
        query_text (str): User's question or issue.
        top_k (int): Number of results to return.
        filter_conditions (dict): Optional metadata filters.
        show_similarity (bool): If True, shows 1 - distance instead of raw distance.

    Returns:
        List of results printed to stdout.
    """
    query_embedding = embedding_model.encode(query_text)
    
    query_params = {
        "query_embeddings": [query_embedding],
        "n_results": top_k,
        "include": ["documents", "metadatas", "distances"]
    }

    if filter_conditions:
        query_params["where"] = filter_conditions

    results = collection.query(**query_params)

    print(f"\n🔍 Query: {query_text}")
    for i, (doc, metadata, distance) in enumerate(zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    )):
        score = 1 - distance if show_similarity else distance
        score_label = "Similarity" if show_similarity else "Distance"
        print(f"\nResult {i+1}:")
        print(f"{score_label}: {score:.4f}")
        print(f"Product: {metadata.get('product')}")
        print(f"Company: {metadata.get('company')}")
        print(f"Text: {doc[:250]}...")


In [14]:
search_complaints("My credit card was declined after payment went through")
search_complaints("The loan interest rate changed without notice")
search_complaints("They closed my savings account without warning")
search_complaints("I was charged twice on my debit card")
search_complaints("I can't send money internationally with my app")



🔍 Query: My credit card was declined after payment went through

Result 1:
Similarity: 0.6014
Product: credit card or prepaid card
Company: bread financial holdings, inc.
Text: okay i have a credit card through [REDACTED] [REDACTED] had it over a year and all was fine until now we are doing work to the house and i use this credit card i pay it off as soon as the charges appear on my statement i went to use it and it was dec...

Result 2:
Similarity: 0.5776
Product: credit card
Company: synchrony financial
Text: i tried to make a purchase my card was declined when i called i was told that my card was charged for the purchase and charged several times for the purchase and they refused to refund the money they also refused to check all previous transactions to...

Result 3:
Similarity: 0.5534
Product: credit card
Company: wells fargo & company
Text: tried to use credit card and purchases were declined multiple times i called customer service multiple times but help was not provided both 

In [15]:
# Run a set of test queries to evaluate semantic retrieval
test_queries = [
    "I was charged a late fee even though I paid on time",
    "They rejected my loan application without any explanation",
    "My savings account was frozen without warning"
]

for query in test_queries:
    search_complaints(query_text=query, top_k=2)



🔍 Query: I was charged a late fee even though I paid on time

Result 1:
Similarity: 0.7436
Product: credit card or prepaid card
Company: capital one financial corporation
Text: i was charged a late fee on of my credit cards and my payments were made on time...

Result 2:
Similarity: 0.7260
Product: credit card or prepaid card
Company: bread financial holdings, inc.
Text: late fee on time and now every month they ve been charging me a late fee on the late fee...

🔍 Query: They rejected my loan application without any explanation

Result 1:
Similarity: 0.6394
Product: money transfer, virtual currency, or money service
Company: enova international, inc.
Text: received a denial for a loan application i did not ask for did not apply for a loan...

Result 2:
Similarity: 0.5103
Product: payday loan
Company: borrowersfirst, inc.
Text: i applied for the personal loan two times and i was denied without getting the reason from the lender it is the borrowersfirst who asked to provide all the veri

In [16]:
# Run filtered queries limited to 'credit card or prepaid card' product
filtered_queries = [
    "My credit card keeps getting declined",
    "They keep increasing my credit card interest rate"
]

for query in filtered_queries:
    search_complaints(
        query_text=query,
        top_k=2,
        filter_conditions={"product": "credit card or prepaid card"}
    )



🔍 Query: My credit card keeps getting declined

Result 1:
Similarity: 0.6089
Product: credit card or prepaid card
Company: american express company
Text: i have one personal and business american express cards i have had my personal card for over years i have requested at least times for a credit limit increase and each time the system automatically declines me i have sent in my filed tax returns my c...

Result 2:
Similarity: 0.5673
Product: credit card or prepaid card
Company: wells fargo & company
Text: my wells fargo credit card is being declined for the past two months for at least times i have called various department of wells fargo wells fargo is not doing anything to fix the problem i am embarrassed every time my card is declined this decline ...

🔍 Query: They keep increasing my credit card interest rate

Result 1:
Similarity: 0.4770
Product: credit card or prepaid card
Company: u.s. bancorp
Text: and being careful not to spend they have never raised my interest rate for the