In [1]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import pandas as pd
import os

In [2]:

# Read the mock order dataset into a DataFrame and show its column names
df = pd.read_csv(filepath_or_buffer='data/mock_dataset.csv')
df.columns

Index(['Order ID', 'Tracking Number', 'Shipment Date',
       'Expected Delivery Date', 'Delivery Date', 'Status', 'Customer ID',
       'Customer Name', 'Customer Email', 'Customer Phone', 'Order Date',
       'Refund Requested', 'Refund Reason', 'Refund Status', 'Refund Date',
       'Notification Status', 'Product', 'Description'],
      dtype='object')

In [3]:
# Summarise the DataFrame: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Order ID                50 non-null     object
 1   Tracking Number         50 non-null     int64 
 2   Shipment Date           39 non-null     object
 3   Expected Delivery Date  39 non-null     object
 4   Delivery Date           19 non-null     object
 5   Status                  50 non-null     object
 6   Customer ID             50 non-null     object
 7   Customer Name           50 non-null     object
 8   Customer Email          50 non-null     object
 9   Customer Phone          50 non-null     object
 10  Order Date              50 non-null     object
 11  Refund Requested        50 non-null     bool  
 12  Refund Reason           17 non-null     object
 13  Refund Status           17 non-null     object
 14  Refund Date             12 non-null     object
 15  Notifica

In [4]:
# Split the CSV columns into two groups: the ones whose text is embedded,
# and the ones that are attached to each document as metadata.
columns_to_embed = [
    "Product",
    "Description",
]
columns_to_metadata = [
    'Order ID',
    'Tracking Number',
    'Shipment Date',
    'Expected Delivery Date',
    'Delivery Date',
    'Status',
    'Customer ID',
    'Customer Name',
    'Customer Email',
    'Customer Phone',
    'Order Date',
    'Refund Requested',
    'Refund Reason',
    'Refund Status',
    'Refund Date',
    'Notification Status',
]

In [5]:
# Turn every CSV row into a Document: the columns listed in columns_to_embed
# become the embeddable text (one "name: value" line per column) and the columns
# listed in columns_to_metadata become the document metadata.
# csv.DictReader yields every value as a string, so metadata values are stored
# as strings (numbers, dates and booleans included).
docs = []
with open('data/mock_dataset.csv', newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        # DictReader fills the missing fields of a short row with None; normalise
        # those to "" (the same value an empty cell has) so that .strip() below
        # cannot raise and no None ends up in the metadata.
        to_metadata = {col: row[col] or "" for col in columns_to_metadata if col in row}
        values_to_embed = {k: row[k] or "" for k in columns_to_embed if k in row}
        to_embed = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in values_to_embed.items())
        newDoc = Document(page_content=to_embed, metadata=to_metadata)
        docs.append(newDoc)

In [6]:
# Chunk the documents with a character-based splitter: break on newlines,
# target chunk size of 500 (measured with len), no overlap between chunks.
splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=0,
    chunk_size=500,
    separator="\n",
)
documents = splitter.split_documents(docs)

In [7]:
# Spot-check one split document (index 34): its page content and its metadata
documents[34]

Document(page_content='Product: Sports Eyeglasses\nDescription: Durable frames designed for athletic use.', metadata={'Order ID': 'ORD1034', 'Tracking Number': '123456823', 'Shipment Date': '2024-04-19', 'Expected Delivery Date': '2024-04-22', 'Delivery Date': '2024-05-01', 'Status': 'cancel', 'Customer ID': 'CUST1034', 'Customer Name': 'Joseph Stafford', 'Customer Email': 'renee52@example.net', 'Customer Phone': '963-748-4179x64561', 'Order Date': '2024-03-19', 'Refund Requested': 'True', 'Refund Reason': 'Damaged Item', 'Refund Status': 'approved', 'Refund Date': '2024-05-07', 'Notification Status': 'False'})

In [8]:
persist_directory = './data/chromadb3/'
# Generate embeddings from the documents and store them in a Chroma vector
# database persisted under persist_directory. One embeddings client is created
# and reused instead of instantiating OpenAIEmbeddings twice.
# NOTE(review): re-running this cell against an already populated
# persist_directory presumably adds the same documents again - confirm, and
# clear the directory before a re-run if so.
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings_model, persist_directory=persist_directory)

  warn_deprecated(


In [9]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [10]:
# Schema handed to the self-query retriever: one AttributeInfo per metadata key.
# The metadata is loaded from CSV, so every value is stored as a string
# (e.g. 'Tracking Number': '123456823', 'Refund Requested': 'True',
# 'Shipment Date': '2024-04-19'). The declared types and descriptions therefore
# describe strings, so that generated filters compare against values of the
# same type and format as the stored ones.
metadata_field_info = [
    AttributeInfo(
        name="Order ID",
        description="Unique identifier for each order",
        type="string",
    ),
    AttributeInfo(
        name="Tracking Number",
        description="Tracking number associated with the shipment of the order, stored as a string of digits",
        type="string",
    ),
    AttributeInfo(
        name="Shipment Date",
        description="The date on which the order was shipped, as a string formatted YYYY-MM-DD",
        type="string",
    ),
    AttributeInfo(
        name="Expected Delivery Date",
        description="The estimated date on which the order is expected to be delivered, as a string formatted YYYY-MM-DD",
        type="string",
    ),
    AttributeInfo(
        name="Delivery Date",
        description="The actual date on which the order was delivered, as a string formatted YYYY-MM-DD",
        type="string",
    ),
    AttributeInfo(
        name="Status",
        # The dataset stores the cancelled state as 'cancel', not 'cancelled'.
        description="Current status of the order (e.g., pending, delivered, cancel)",
        type="string",
    ),
    AttributeInfo(
        name="Customer ID",
        description="Unique identifier for the customer who placed the order",
        type="string",
    ),
    AttributeInfo(
        name="Customer Name",
        description="Name of the customer who placed the order",
        type="string",
    ),
    AttributeInfo(
        name="Customer Email",
        description="Email address of the customer",
        type="string",
    ),
    AttributeInfo(
        name="Customer Phone",
        description="Phone number of the customer",
        type="string",
    ),
    AttributeInfo(
        name="Order Date",
        description="The date on which the order was placed, as a string formatted YYYY-MM-DD",
        type="string",
    ),
    AttributeInfo(
        name="Refund Requested",
        description="Indicates whether a refund was requested for the order; stored as the string 'True' or 'False'",
        type="string",
    ),
    AttributeInfo(
        name="Refund Reason",
        description="The reason provided by the customer for requesting a refund",
        type="string",
    ),
    AttributeInfo(
        name="Refund Status",
        description="Current status of the refund request (e.g., approved, rejected)",
        type="string",
    ),
    AttributeInfo(
        name="Refund Date",
        description="The date on which the refund was processed, as a string formatted YYYY-MM-DD",
        type="string",
    ),
    AttributeInfo(
        name="Notification Status",
        description="Indicates whether notifications related to the order are active; stored as the string 'True' or 'False'",
        type="string",
    )
]
# Describes the embedded page content, which holds only the Product and
# Description columns; order, customer and refund details live in the metadata.
document_content_description = "Name and description of the product that was ordered."


## Self-query retriever

In [12]:
# Configure the retriever: wire a zero-temperature OpenAI LLM, the vector store,
# the content description and the metadata schema into a SelfQueryRetriever.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

  warn_deprecated(


In [13]:
# Ask a natural-language question through the self-query retriever; the call
# returns a list of matching Document objects (page content plus metadata).
retriever.get_relevant_documents("give me documentation about Order ID 'ORD1034' ")

  warn_deprecated(


[Document(page_content='Product: Sports Eyeglasses\nDescription: Durable frames designed for athletic use.', metadata={'Customer Email': 'renee52@example.net', 'Customer ID': 'CUST1034', 'Customer Name': 'Joseph Stafford', 'Customer Phone': '963-748-4179x64561', 'Delivery Date': '2024-05-01', 'Expected Delivery Date': '2024-04-22', 'Notification Status': 'False', 'Order Date': '2024-03-19', 'Order ID': 'ORD1034', 'Refund Date': '2024-05-07', 'Refund Reason': 'Damaged Item', 'Refund Requested': 'True', 'Refund Status': 'approved', 'Shipment Date': '2024-04-19', 'Status': 'cancel', 'Tracking Number': '123456823'})]