# CSV Loading Experiment
Testing the CSV loader on the e-commerce fashion dataset (`data/fashion/FashionDataset.csv`).

In [1]:
import sys
# Add the parent directory to the import search path so that the `src`
# package imported below can be resolved.
# NOTE(review): '..' is relative to the kernel's working directory —
# presumably the folder containing this notebook; confirm before moving it.
sys.path.append('..')

# NOTE(review): load_all_csvs_from_directory is imported but not called in
# any of the cells shown in this notebook.
from src.csv_loader import load_csv_as_documents, load_all_csvs_from_directory

  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Load Fashion CSV

In [2]:
# Read every product in the fashion CSV into a list of documents,
# then report how many documents were produced.
fashion_csv_path = '../data/fashion/FashionDataset.csv'
fashion_docs = load_csv_as_documents(fashion_csv_path)

doc_total = len(fashion_docs)
print(f"\nTotal documents: {doc_total}")

Loading CSV: ../data/fashion/FashionDataset.csv
Found 30758 valid products
Created 30758 documents from CSV

Total documents: 30758


## Step 2: Inspect Sample Documents

In [3]:
# Show the very first product: its text content, then its metadata dict.
first_doc = fashion_docs[0]

print("First Product:")
print("="*60)
print(first_doc.page_content)
print("\nMetadata:")
print(first_doc.metadata)

First Product:
BrandName: life
Deatils: solid cotton blend collar neck womens a-line dress - indigo
Sizes: Size:Large,Medium,Small,X-Large,X-Small
Category: Westernwear-Women
Original Price: Rs
1699
Selling Price: 849
Discount: 50% off

Metadata:
{'source': '../data/fashion/FashionDataset.csv', 'source_type': 'csv', 'row_id': '0', 'brand': 'life', 'category': 'Westernwear-Women', 'sell_price': '849', 'mrp': 'Rs\n1699', 'discount': '50% off'}


In [4]:
# Preview products 1-3, showing at most the first 200 characters of each.
# Note: the "..." suffix is appended unconditionally, so it also appears
# after content that is shorter than 200 characters.
for idx in (1, 2, 3):
    print(f"\nProduct {idx}:")
    print("-"*60)
    snippet = fashion_docs[idx].page_content[:200]
    print(snippet + "...")


Product 1:
------------------------------------------------------------
BrandName: only
Deatils: polyester peter pan collar womens blouson dress - yellow
Sizes: Size:34,36,38,40
Category: Westernwear-Women
Original Price: Rs
3499
Selling Price: 2449
Discount: 30% off...

Product 2:
------------------------------------------------------------
BrandName: fratini
Deatils: solid polyester blend wide neck womens regular top - off white
Sizes: Size:Large,X-Large,XX-Large
Category: Westernwear-Women
Original Price: Rs
1199
Selling Price: 599
Dis...

Product 3:
------------------------------------------------------------
BrandName: zink london
Deatils: stripes polyester sweetheart neck womens dress - black
Sizes: Size:Large,Medium,Small,X-Large
Category: Westernwear-Women
Original Price: Rs
2299
Selling Price: 1379
Di...


## Step 3: Category Analysis

In [5]:
# Tally how many products fall into each category and list the
# (up to) ten most frequent categories, largest first.
from collections import Counter

categories = [product.metadata['category'] for product in fashion_docs]
category_counts = Counter()
category_counts.update(categories)

print("Products by Category:")
print("="*60)
top_categories = category_counts.most_common(10)
for name, n_products in top_categories:
    print(f"{name}: {n_products} products")

Products by Category:
Westernwear-Women: 10374 products
Indianwear-Women: 10374 products
Lingerie&Nightwear-Women: 3354 products
Footwear-Women: 2574 products
Watches-Women: 1794 products
Jewellery-Women: 1794 products
Fragrance-Women: 494 products


## Step 4: Check Document Format
Make sure the documents are compatible with our existing RAG pipeline.

In [6]:
# Spot-check one document (index 100): report its type, whether it has
# the `page_content` and `metadata` attributes, and the size/keys of each.
sample = fashion_docs[100]

sample_type = type(sample)
print(f"Document type: {sample_type}")

has_content = hasattr(sample, 'page_content')
print(f"Has page_content: {has_content}")

has_metadata = hasattr(sample, 'metadata')
print(f"Has metadata: {has_metadata}")

content_length = len(sample.page_content)
print(f"\nContent length: {content_length} characters")

metadata_keys = list(sample.metadata.keys())
print(f"Metadata keys: {metadata_keys}")

Document type: <class 'langchain_core.documents.base.Document'>
Has page_content: True
Has metadata: True

Content length: 119 characters
Metadata keys: ['source', 'source_type', 'row_id', 'brand', 'category', 'sell_price', 'mrp', 'discount']


## Summary
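- The CSV loader creates LangChain `Document` objects (`langchain_core.documents.base.Document`)
- Each product becomes one document (30,758 products → 30,758 documents)
- Each document has `page_content` and `metadata`, so it should be compatible with the existing ChromaDB indexing (indexing itself is not exercised in this notebook)
- Ready to integrate with PDF documents
- Caveat: the original price is loaded with a currency prefix and an embedded newline (e.g. `'mrp': 'Rs\n1699'`), unlike `sell_price` (`'849'`); consider normalizing it before indexing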