### Parsing CSV And Excel Files

In [1]:
import os
import pandas as pd
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

In [2]:
os.makedirs('data/structured_files', exist_ok=True)

In [3]:
# Create a data in dictionary format

data = {
    'Product': [
        'Laptop Pro', 'Wireless Mouse', 'Mechanical Keyboard', 'Gaming Monitor',
        'Bluetooth Headphones', 'USB-C Hub', 'External SSD', 'Webcam HD',
        'Smartphone X', 'Tablet Mini', 'Smart Watch', 'Wireless Charger',
        'Desktop Speaker', 'Printer Laser', 'Router WiFi6', 'Microphone Condenser',
        'Graphics Card', 'RAM 16GB', 'Power Bank', 'VR Headset'
    ],
    'Category': [
        'Computers', 'Accessories', 'Accessories', 'Displays',
        'Audio', 'Accessories', 'Storage', 'Accessories',
        'Mobile', 'Mobile', 'Wearables', 'Accessories',
        'Audio', 'Office', 'Networking', 'Audio',
        'Components', 'Components', 'Mobile', 'Gaming'
    ],
    'Price': [
        1299.99, 29.99, 89.99, 349.99,
        159.99, 49.99, 129.99, 79.99,
        899.99, 399.99, 249.99, 39.99,
        129.99, 299.99, 149.99, 199.99,
        499.99, 89.99, 59.99, 299.99
    ],
    'Stock': [
        15, 100, 45, 25,
        60, 75, 30, 50,
        20, 35, 40, 90,
        25, 18, 30, 15,
        12, 55, 80, 22
    ],
    'Description': [
        'High-performance laptop with 16GB RAM and 512GB SSD',
        'Ergonomic wireless mouse with 2.4GHz connectivity',
        'RGB mechanical keyboard with Cherry MX switches',
        '27-inch 144Hz gaming monitor with 1ms response time',
        'Noise-cancelling wireless headphones with 30hr battery',
        '7-in-1 USB-C hub with HDMI and Ethernet ports',
        '1TB external SSD with USB 3.2 Gen 2 connectivity',
        '1080p webcam with built-in microphone and autofocus',
        'Flagship smartphone with triple camera system',
        '8-inch tablet with stylus support and 128GB storage',
        'Fitness tracker with heart rate monitor and GPS',
        '15W fast wireless charging pad for Qi-enabled devices',
        'Stereo speaker system with Bluetooth 5.0',
        'Monochrome laser printer with wireless printing',
        'WiFi 6 router with mesh networking capabilities',
        'Studio-quality condenser microphone for recording',
        'High-end graphics card with 8GB GDDR6 memory',
        'DDR4 3200MHz memory module for desktop computers',
        '20000mAh portable power bank with PD charging',
        'Virtual reality headset with 6DOF tracking'
    ]
}

In [4]:
#Save as csv
df = pd.DataFrame(data)

df.to_csv('data/structured_files/products.csv', index=False)

In [5]:
df.head()

Unnamed: 0,Product,Category,Price,Stock,Description
0,Laptop Pro,Computers,1299.99,15,High-performance laptop with 16GB RAM and 512G...
1,Wireless Mouse,Accessories,29.99,100,Ergonomic wireless mouse with 2.4GHz connectivity
2,Mechanical Keyboard,Accessories,89.99,45,RGB mechanical keyboard with Cherry MX switches
3,Gaming Monitor,Displays,349.99,25,27-inch 144Hz gaming monitor with 1ms response...
4,Bluetooth Headphones,Audio,159.99,60,Noise-cancelling wireless headphones with 30hr...


In [6]:
# Create a dictionary data for electronics products
data_summary = {
    'Category': ['Smartphones', 'Laptops', 'Headphones', 'Tablets', 'Smart Watches', 
                'Gaming Consoles', 'TVs', 'Cameras', 'Printers', 'Monitors',
                'Speakers', 'Routers', 'External Hard Drives', 'Keyboards', 'Mice',
                'Power Banks', 'Chargers', 'Drones', 'Smart Home Devices', 'VR Headsets'],
    
    'Total items': [150, 85, 200, 75, 120, 45, 60, 55, 40, 70,
                   180, 95, 110, 160, 190, 130, 250, 35, 140, 25],
    
    'Total Values': [75000, 127500, 30000, 45000, 60000, 67500, 120000, 82500, 
                    20000, 35000, 27000, 28500, 55000, 24000, 19000, 39000, 
                    37500, 52500, 70000, 37500]
}

In [7]:
#Save the data_summary as excel
with pd.ExcelWriter('data/structured_files/inventory.xlsx') as writer:
    df.to_excel(writer, sheet_name='Products', index=False)


In [8]:
df = pd.DataFrame(data_summary)

df.to_excel(writer, sheet_name='Summary', index=False)

Processing CSV

In [None]:
df.head()
# Load the CSV file using CSVLoader
csv_loader = CSVLoader(file_path='data/structured_files/products.csv', # path to your CSV file
                       encoding='utf-8', # file encoding i.e. 'utf-8' means utf-8 encoding i.e. standard text encoding
                       csv_args={'delimiter': ',', 'quotechar': '"'}) # means comma separated values with double quotes for text i.e. "text"
csv_documents = csv_loader.load() # this loads the CSV file and creates a list of Document objects, one per row in the CSV file
print(f"Loaded {len(csv_documents)} documents (one per row) from CSV using CSVLoader.") # print the number of documents loaded
print('\nFirst document:' , csv_documents[0].page_content) # print the content of the first document
print('\n' + '-'*50 + '\n') # print a separator line
print(csv_documents[0].metadata)    # print the metadata of the first document (contains info like source file, row number, etc.)
print(csv_documents) # print all documents loaded

Loaded 20 documents (one per row) from CSV using CSVLoader.

First document: Product: Laptop Pro
Category: Computers
Price: 1299.99
Stock: 15
Description: High-performance laptop with 16GB RAM and 512GB SSD

--------------------------------------------------

{'source': 'data/structured_files/products.csv', 'row': 0}
[Document(metadata={'source': 'data/structured_files/products.csv', 'row': 0}, page_content='Product: Laptop Pro\nCategory: Computers\nPrice: 1299.99\nStock: 15\nDescription: High-performance laptop with 16GB RAM and 512GB SSD'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 1}, page_content='Product: Wireless Mouse\nCategory: Accessories\nPrice: 29.99\nStock: 100\nDescription: Ergonomic wireless mouse with 2.4GHz connectivity'), Document(metadata={'source': 'data/structured_files/products.csv', 'row': 2}, page_content='Product: Mechanical Keyboard\nCategory: Accessories\nPrice: 89.99\nStock: 45\nDescription: RGB mechanical keyboard with Cherry 

In [16]:
for doc in csv_documents:
    print(csv_documents.index(doc)) # print the index of the document
    print(doc.page_content)
    print(doc.metadata)
    print('\n' + '-'*80 + '\n') # print a separator line
    

0
Product: Laptop Pro
Category: Computers
Price: 1299.99
Stock: 15
Description: High-performance laptop with 16GB RAM and 512GB SSD
{'source': 'data/structured_files/products.csv', 'row': 0}

--------------------------------------------------------------------------------

1
Product: Wireless Mouse
Category: Accessories
Price: 29.99
Stock: 100
Description: Ergonomic wireless mouse with 2.4GHz connectivity
{'source': 'data/structured_files/products.csv', 'row': 1}

--------------------------------------------------------------------------------

2
Product: Mechanical Keyboard
Category: Accessories
Price: 89.99
Stock: 45
Description: RGB mechanical keyboard with Cherry MX switches
{'source': 'data/structured_files/products.csv', 'row': 2}

--------------------------------------------------------------------------------

3
Product: Gaming Monitor
Category: Displays
Price: 349.99
Stock: 25
Description: 27-inch 144Hz gaming monitor with 1ms response time
{'source': 'data/structured_files/pr

In [17]:
# Load the CSV file using UnstructuredCSVLoader
unstructured_csv_loader = UnstructuredCSVLoader(file_path='data/structured_files/products.csv', # path to your CSV file
                                               encoding='utf-8', # file encoding i.e. 'utf-8' means utf-8 encoding i.e. standard text encoding
                                               csv_args={'delimiter': ',', 'quotechar': '"'}) # means comma separated values with double quotes for text i.e. "text"
csv_documents = unstructured_csv_loader.load() # this loads the CSV file and creates a list of Document objects, one per row in the CSV file
print(f' Loaded {len(csv_documents)} documents (one per row) from CSV using UnstructuredCSVLoader.') # print the number of documents loaded
print('\nFirst document:' , csv_documents[0].page_content) # print the content of the first document
print('\n' + '-'*50 + '\n') # print a separator line
print(csv_documents[0].metadata)    # print the metadata of the first document (contains info like source file, row number, etc.)


print(csv_documents) # print all documents loaded


 Loaded 1 documents (one per row) from CSV using UnstructuredCSVLoader.

First document: Product Category Price Stock Description Laptop Pro Computers 1299.99 15 High-performance laptop with 16GB RAM and 512GB SSD Wireless Mouse Accessories 29.99 100 Ergonomic wireless mouse with 2.4GHz connectivity Mechanical Keyboard Accessories 89.99 45 RGB mechanical keyboard with Cherry MX switches Gaming Monitor Displays 349.99 25 27-inch 144Hz gaming monitor with 1ms response time Bluetooth Headphones Audio 159.99 60 Noise-cancelling wireless headphones with 30hr battery USB-C Hub Accessories 49.99 75 7-in-1 USB-C hub with HDMI and Ethernet ports External SSD Storage 129.99 30 1TB external SSD with USB 3.2 Gen 2 connectivity Webcam HD Accessories 79.99 50 1080p webcam with built-in microphone and autofocus Smartphone X Mobile 899.99 20 Flagship smartphone with triple camera system Tablet Mini Mobile 399.99 35 8-inch tablet with stylus support and 128GB storage Smart Watch Wearables 249.99 40 Fit

In [18]:
for doc in csv_documents:
    print(csv_documents.index(doc)) # print the index of the document
    print(doc.page_content)
    print(doc.metadata)
    print('\n' + '-'*80 + '\n') # print a separator line

0
Product Category Price Stock Description Laptop Pro Computers 1299.99 15 High-performance laptop with 16GB RAM and 512GB SSD Wireless Mouse Accessories 29.99 100 Ergonomic wireless mouse with 2.4GHz connectivity Mechanical Keyboard Accessories 89.99 45 RGB mechanical keyboard with Cherry MX switches Gaming Monitor Displays 349.99 25 27-inch 144Hz gaming monitor with 1ms response time Bluetooth Headphones Audio 159.99 60 Noise-cancelling wireless headphones with 30hr battery USB-C Hub Accessories 49.99 75 7-in-1 USB-C hub with HDMI and Ethernet ports External SSD Storage 129.99 30 1TB external SSD with USB 3.2 Gen 2 connectivity Webcam HD Accessories 79.99 50 1080p webcam with built-in microphone and autofocus Smartphone X Mobile 899.99 20 Flagship smartphone with triple camera system Tablet Mini Mobile 399.99 35 8-inch tablet with stylus support and 128GB storage Smart Watch Wearables 249.99 40 Fitness tracker with heart rate monitor and GPS Wireless Charger Accessories 39.99 90 15W 

In [23]:
# Method to load excel file using pandas and convert each sheet to csv and then load using CSVLoader
excel_file_path = 'data/structured_files/inventory.xlsx' # path to your excel file
xls = pd.ExcelFile(excel_file_path) # load the excel file
sheet_names = xls.sheet_names # get the sheet names
print(f'Sheet names in the Excel file: {sheet_names}') # print the sheet names
for sheet in sheet_names:
    df_sheet = pd.read_excel(xls, sheet_name=sheet) # read each sheet into a DataFrame
    csv_path = f'data/structured_files/{sheet}.csv' # define a path to save the CSV
    df_sheet.to_csv(csv_path, index=False) # save the DataFrame as a CSV file
    print(f'Saved sheet "{sheet}" as CSV at: {csv_path}') # print confirmation

    # Now load the saved CSV using CSVLoader
    csv_loader = CSVLoader(file_path=csv_path, encoding='utf-8', csv_args={'delimiter': ',', 'quotechar': '"'})
    csv_documents = csv_loader.load() # load the CSV file
    print(f'Loaded {len(csv_documents)} documents from sheet "{sheet}" using CSVLoader.') # print number of documents loaded
    print('\nFirst document from this sheet:' , csv_documents[5].page_content) # print first document content
    print('\n' + '-'*50 + '\n') # print a separator line
    print(csv_documents[5].metadata)    # print metadata of the first document

Sheet names in the Excel file: ['Products']
Saved sheet "Products" as CSV at: data/structured_files/Products.csv
Loaded 20 documents from sheet "Products" using CSVLoader.

First document from this sheet: Product: USB-C Hub
Category: Accessories
Price: 49.99
Stock: 75
Description: 7-in-1 USB-C hub with HDMI and Ethernet ports

--------------------------------------------------

{'source': 'data/structured_files/Products.csv', 'row': 5}


In [25]:
# Create custom csv processing for better formatting
for doc in csv_documents:
    print(csv_documents.index(doc)) # print the index of the document
    print(doc.page_content)
    print(doc.metadata)
    print('\n' + '-'*80 + '\n') # print a separator line

0
Product: Laptop Pro
Category: Computers
Price: 1299.99
Stock: 15
Description: High-performance laptop with 16GB RAM and 512GB SSD
{'source': 'data/structured_files/Products.csv', 'row': 0}

--------------------------------------------------------------------------------

1
Product: Wireless Mouse
Category: Accessories
Price: 29.99
Stock: 100
Description: Ergonomic wireless mouse with 2.4GHz connectivity
{'source': 'data/structured_files/Products.csv', 'row': 1}

--------------------------------------------------------------------------------

2
Product: Mechanical Keyboard
Category: Accessories
Price: 89.99
Stock: 45
Description: RGB mechanical keyboard with Cherry MX switches
{'source': 'data/structured_files/Products.csv', 'row': 2}

--------------------------------------------------------------------------------

3
Product: Gaming Monitor
Category: Displays
Price: 349.99
Stock: 25
Description: 27-inch 144Hz gaming monitor with 1ms response time
{'source': 'data/structured_files/Pr

In [28]:
# Method 2: Create  custom csv processing for better control over formatting
print("Custom CSV Processing:\n")

def custom_csv_processing(file_path):
    with open(file_path, 'r', encoding='utf-8') as f: # open the CSV file in read mode with utf-8 encoding
        lines = f.readlines() # read all lines from the file
    
    documents = [] # list to hold processed documents
    headers = lines[0].strip().split(',')  # Extract headers from the first line
    
    for i, line in enumerate(lines[1:]):  # Skip header line
        values = line.strip().split(',') # Split each line into values
        content = "\n".join([f"{header}: {value}" for header, value in zip(headers, values)]) # Format content as "Header: Value"
        metadata = {
            'source': file_path, # source file path
            'row': i + 2  # +2 to account for header and 0-based index
        }
        documents.append({'content': content, 'metadata': metadata}) # append the document to the list i.e. content and metadata were added to the list
    
    return documents # return the list of documents

custom_documents = custom_csv_processing('data/structured_files/products.csv') # call the custom processing function
print(f"Custom processed {len(custom_documents)} documents from CSV.") # print number of documents
for doc in custom_documents:
    print(custom_documents.index(doc)) # print the index of the document
    print(doc['content']) # print the content of the document
    print(doc['metadata']) # print the metadata of the document
    print('\n' + '-'*80 + '\n') # print a separator line


Custom CSV Processing:

Custom processed 20 documents from CSV.
0
Product: Laptop Pro
Category: Computers
Price: 1299.99
Stock: 15
Description: High-performance laptop with 16GB RAM and 512GB SSD
{'source': 'data/structured_files/products.csv', 'row': 2}

--------------------------------------------------------------------------------

1
Product: Wireless Mouse
Category: Accessories
Price: 29.99
Stock: 100
Description: Ergonomic wireless mouse with 2.4GHz connectivity
{'source': 'data/structured_files/products.csv', 'row': 3}

--------------------------------------------------------------------------------

2
Product: Mechanical Keyboard
Category: Accessories
Price: 89.99
Stock: 45
Description: RGB mechanical keyboard with Cherry MX switches
{'source': 'data/structured_files/products.csv', 'row': 4}

--------------------------------------------------------------------------------

3
Product: Gaming Monitor
Category: Displays
Price: 349.99
Stock: 25
Description: 27-inch 144Hz gaming mon