## CSV and excel files- Structured Data

In [43]:
import pandas as pd
import os

In [44]:
os.makedirs("data/struct_files",exist_ok=True)

In [45]:
# Sample data for csv
data = {
    "id": [1, 2, 3, 4],
    "name": ["Rahul", "Aisha", "Karan", "Meera"],
    "role": ["Data Analyst", "ML Engineer", "Backend Developer", "Intern"],
    "experience_years": [2, 3, 1, 0],
    "location": ["Bangalore", "Hyderabad", "Pune", "Remote"]
}

df = pd.DataFrame(data)
df.to_csv("data/struct_files/employees.csv", index=False)

In [46]:
# Sample data for excel
data = {
    "order_id": [101, 102, 103, 104],
    "customer": ["Ankit", "Riya", "Mohit", "Sara"],
    "product": ["Laptop", "Keyboard", "Monitor", "Mouse"],
    "quantity": [1, 2, 1, 3],
    "price": [65000, 2500, 12000, 700]
}

# Create dataframe
df = pd.DataFrame(data)

# Save as Excel file
df.to_excel("data/struct_files/sales_sample.xlsx", index=False)

## CSV Processing

## âœ… Method 1 â€” CSVLoader (Simple & Quick)
### Use when:
- Each row = one document
- No custom formatting needed
- OK with default structure
- Good for small files / testing / demos

### ðŸ§  Think of it as: "Fast load, minimal control"

## âœ… Method 2 â€” Custom CSV Processing (Full Control)
### Use when:
- You want custom page_content
- Need rich metadata + cleaning
- Group / chunk rows (teams, dept, etc.)
- Handle large files efficiently
- Skip / validate bad rows

### ðŸ§  Think of it as: "You control text, metadata, and structure"


In [47]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredCSVLoader

In [48]:
# Method 1: CSVLoader - Each row becomes a document
print("CSVLoader - Row-based Documents")
csv_loader = CSVLoader(
    file_path="data/struct_files/employees.csv",
    encoding="utf-8",
    csv_args={
        "delimiter": ",",
        "quotechar": '"'
    }
)

csv_docs = csv_loader.load()
print(csv_docs)
print(f"Loaded {len(csv_docs)} documents (one per row)")
print("\nFirst document content:")
print(f"Content: {csv_docs[0].page_content}")
print(f"Metadata: {csv_docs[0].metadata}")


CSVLoader - Row-based Documents
[Document(metadata={'source': 'data/struct_files/employees.csv', 'row': 0}, page_content='id: 1\nname: Rahul\nrole: Data Analyst\nexperience_years: 2\nlocation: Bangalore'), Document(metadata={'source': 'data/struct_files/employees.csv', 'row': 1}, page_content='id: 2\nname: Aisha\nrole: ML Engineer\nexperience_years: 3\nlocation: Hyderabad'), Document(metadata={'source': 'data/struct_files/employees.csv', 'row': 2}, page_content='id: 3\nname: Karan\nrole: Backend Developer\nexperience_years: 1\nlocation: Pune'), Document(metadata={'source': 'data/struct_files/employees.csv', 'row': 3}, page_content='id: 4\nname: Meera\nrole: Intern\nexperience_years: 0\nlocation: Remote')]
Loaded 4 documents (one per row)

First document content:
Content: id: 1
name: Rahul
role: Data Analyst
experience_years: 2
location: Bangalore
Metadata: {'source': 'data/struct_files/employees.csv', 'row': 0}


In [49]:
## Method 2: âœ… Custom CSV Processing â€” Full Control

import pandas as pd
from langchain_core.documents import Document

df = pd.read_csv(
    "data/struct_files/employees.csv",
    encoding="utf-8"
)

documents = []

for _, row in df.iterrows():
    # Customize how text content is built
    content = f"""
    Employee Name: {row['name']}
    Role: {row['role']}
    Experience: {row['experience_years']} years
    Location: {row['location']}
    """

    # Customize metadata
    metadata = {
        "id": int(row["id"]),
        "source": "employees.csv",
        "department": row.get("department", "unknown")
    }

    documents.append(Document(page_content=content.strip(), metadata=metadata))

print(f"Created {len(documents)} custom documents")
print(documents[0])


Created 4 custom documents
page_content='Employee Name: Rahul
    Role: Data Analyst
    Experience: 2 years
    Location: Bangalore' metadata={'id': 1, 'source': 'employees.csv', 'department': 'unknown'}


## Excel Processing

### âœ… Method 1 â€” Pandas Excel Processing (Full Control â€” Recommended)

Use when:
- You want full control over text formatting
- Need custom metadata + cleaning / normalization
- Want to group / chunk data (customer, product, sheet, etc.)
- Need reliable processing for RAG pipelines
- Want fewer dependencies than loaders

ðŸ§  Think of it as: "You fully control content, metadata, and structure"

### âœ… Method 2 â€” UnstructuredExcelLoader (Simple & Quick)

Use when:
- Each row or element as a document is fine
- Default auto-parsed structure is acceptable
- You just want to load and test quickly
- Useful for small files / prototyping / demos
- Not much customization needed

ðŸ§  Think of it as: "Quick load, minimal control"

In [50]:
## Method 1: âœ… Custom Excel Processing â€” Full Control (Recommended)


df = pd.read_excel("data/struct_files/sales_sample.xlsx")

documents = []

for _, row in df.iterrows():
    # Customize text content
    content = f"""
    Order ID: {row['order_id']}
    Customer: {row['customer']}
    Product: {row['product']}
    Quantity: {row['quantity']}
    Price: â‚¹{row['price']}
    """

    # Customize metadata
    metadata = {
        "order_id": int(row["order_id"]),
        "source": "sales_sample.xlsx"
    }

    documents.append(
        Document(page_content=content.strip(), metadata=metadata)
    )

print(f"Created {len(documents)} Excel documents (Full Control)")
print(documents[0])


Created 4 Excel documents (Full Control)
page_content='Order ID: 101
    Customer: Ankit
    Product: Laptop
    Quantity: 1
    Price: â‚¹65000' metadata={'order_id': 101, 'source': 'sales_sample.xlsx'}


In [52]:
## Method 2: âš¡ UnstructuredExcelLoader â€” Simple & Quick (Less Control)

from langchain_community.document_loaders import UnstructuredExcelLoader

print("ExcelLoader - Auto Parsed Documents")

excel_loader = UnstructuredExcelLoader(
    file_path="data/struct_files/sales_sample.xlsx",
    mode="elements"
)

excel_docs = excel_loader.load()

print(f"Loaded {len(excel_docs)} documents (auto-generated)")
print("\nFirst document content:")
print(excel_docs[0].page_content)
print(excel_docs[0].metadata)


ExcelLoader - Auto Parsed Documents
Loaded 1 documents (auto-generated)

First document content:
order_id customer product quantity price 101 Ankit Laptop 1 65000 102 Riya Keyboard 2 2500 103 Mohit Monitor 1 12000 104 Sara Mouse 3 700
{'source': 'data/struct_files/sales_sample.xlsx', 'file_directory': 'data/struct_files', 'filename': 'sales_sample.xlsx', 'last_modified': '2026-01-06T16:30:14', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table><tr><td>order_id</td><td>customer</td><td>product</td><td>quantity</td><td>price</td></tr><tr><td>101</td><td>Ankit</td><td>Laptop</td><td>1</td><td>65000</td></tr><tr><td>102</td><td>Riya</td><td>Keyboard</td><td>2</td><td>2500</td></tr><tr><td>103</td><td>Mohit</td><td>Monitor</td><td>1</td><td>12000</td></tr><tr><td>104</td><td>Sara</td><td>Mouse</td><td>3</td><td>700</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table', 'element_id': '4e988