# 1. Load Data

In [1]:
import pandas as pd
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS  # Changed from Chroma

In [2]:
data_path = Path("../data/processed/filtered_complaints.parquet")
df = pd.read_parquet(data_path)

# 2. Stratified Sampling (Target ~12,500)

In [4]:
frac = 12500 / len(df)
print(f"Sampling fraction: {frac:.4f}")

Sampling fraction: 0.0270


In [13]:
# Stratified sample by 'Product'
sampled_df = df.groupby('Product', group_keys=False).apply(lambda x: x.sample(frac=frac, random_state=42))

print(f"Original shape: {df.shape}")
print(f"Sampled shape: {sampled_df.shape}")

Original shape: (462264, 11)
Sampled shape: (12499, 11)


  sampled_df = df.groupby('Product', group_keys=False).apply(lambda x: x.sample(frac=frac, random_state=42))


# 3. Chunking Experiment

In [None]:

chunk_size = 500
chunk_overlap = 50

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ". ", " ", ""]
)

In [17]:
# Test on one long narrative
sample_text = sampled_df[sampled_df['word_count'] > 200].iloc[0]['cleaned_narrative']
chunks = splitter.split_text(sample_text)

print(f"\nOriginal Length: {len(sample_text)}")
print(f"Chunk 1: {chunks[0]}")


Original Length: 2282
Chunk 1: i am writing to formally submit a complaint against capital one concerning two disputed transactions with [REDACTED] the first dispute, claim # [REDACTED], was initiated in [DATE], involving a transaction amounting to {$11000.00}. the second dispute, claim # [REDACTED], was filed on [DATE], for an additional charge of {$4600.00}. both claims were denied by capital one, and i am deeply concerned about the lack of thorough investigation and proper resolution in both instances


In [25]:
for i, chunk in enumerate(chunks):
    print(f"index: {i}")
    print(f'Chunk: {chunk}')

index: 0
Chunk: i am writing to formally submit a complaint against capital one concerning two disputed transactions with [REDACTED] the first dispute, claim # [REDACTED], was initiated in [DATE], involving a transaction amounting to {$11000.00}. the second dispute, claim # [REDACTED], was filed on [DATE], for an additional charge of {$4600.00}. both claims were denied by capital one, and i am deeply concerned about the lack of thorough investigation and proper resolution in both instances
index: 1
Chunk: . in the initial dispute ( claim # [REDACTED] ), capital one 's response was received within one day, leaving me with the impression that a comprehensive investigation had not taken place. despite my attempts to pursue the matter further, subsequent communications were met with generic responses and an absence of supporting documentation. i took the initiative to contact capital one 's investigation team via email on [DATE], but unfortunately, there was no response
index: 2
Chunk: . r

# 4. Initialize Embedding Model & Test Vector Store

In [18]:
print("\nLoading Embedding Model...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Loading Embedding Model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Quick test of FAISS
print("Creating test index...")
test_docs = chunks[:2]
vectorstore = FAISS.from_texts(test_docs, embedding_model)
print("FAISS index created successfully.")

Creating test index...
FAISS index created successfully.


In [23]:
df.head(3)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company,State,Complaint ID,cleaned_narrative,word_count
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,"CITIBANK, N.A.",TX,14069121,a [REDACTED] [REDACTED] card was opened under ...,91
1,2025-06-13,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,WELLS FARGO & COMPANY,ID,14061897,i made the mistake of using my wellsfargo debi...,109
2,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...","CITIBANK, N.A.",NY,14047085,"dear cfpb, i have a secured credit card with c...",156


In [24]:
df.iterrows()

<generator object DataFrame.iterrows at 0x000002027E646F00>

### Task 2 Summary – Technical Decisions

- **Sampling**: Stratified by `Product` → 12,499 rows (~2.7% fraction, random_state=42)  
- **Chunking**: 500 chars + 50 overlap (RecursiveCharacterTextSplitter) — tested on long narratives  
- **Embedding model**: all-MiniLM-L6-v2 — fast, lightweight, strong semantic performance  
- **Vector store**: FAISS (pivoted from ChromaDB due to Windows dependency issues)  
- **Output**: `vector_store/faiss_index/` with metadata-rich documents