In [1]:
import getpass
import os
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import OpenAI
import csv
from datetime import datetime

In [2]:
# Read the OpenAI key from the environment; the value is None when the variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')

## Generate Documents

In [3]:
# Function to convert data types based on the column
def convert_data(value, column):
    """Convert a raw CSV cell into the typed value stored in document metadata.

    Args:
        value: The raw cell, normally a string. ``None`` (as produced by
            ``csv.DictReader`` for missing trailing cells) and empty or
            whitespace-only strings are treated as "no value".
        column: The CSV column name; it selects the conversion rule.

    Returns:
        * bool for 'Refund Requested' and 'Notification Status'
          (True for 'true', '1', 't', 'yes', case-insensitive; False otherwise).
        * int for 'Tracking Number' (0 when there is no value).
        * dict with "year", "month", "day" ints for the date columns
          (all zeros when there is no value).
        * the value unchanged for every other column ("" when it is None).

    Raises:
        ValueError: If a non-empty tracking number is not an integer or a
            non-empty date is not in ``YYYY-MM-DD`` format.
    """
    if column in ['Refund Requested', 'Notification Status']:
        # A missing cell used to crash on ``None.lower()``; treat it as False,
        # consistent with the defaults used by the other branches.
        if value is None:
            return False
        return str(value).strip().lower() in ['true', '1', 't', 'yes']
    elif column in ['Tracking Number']:
        # Strip first so a whitespace-only cell falls back to the default
        # instead of reaching int() and raising.
        text = value.strip() if isinstance(value, str) else value
        return int(text) if text else 0  # Default to 0 if None
    elif column in ['Shipment Date', 'Expected Delivery Date', 'Delivery Date', 'Order Date', 'Refund Date']:
        # strptime does not tolerate surrounding whitespace, so strip it here.
        text = value.strip() if isinstance(value, str) else value
        if text:
            date = datetime.strptime(text, '%Y-%m-%d')
            return {"year": date.year, "month": date.month, "day": date.day}
        else:
            return {"year": 0, "month": 0, "day": 0}  # Default to 0 if None
    else:
        return value if value is not None else ""  # Default to empty string if None

# Read the mock dataset into a DataFrame
df = pd.read_csv('data/mock_dataset.csv')

# Columns whose text is joined into the embedded document body
columns_to_embed = ["Product", "Description"]

# Columns that are attached to each document as metadata instead of being embedded
columns_to_metadata = [
    'Order ID',
    'Tracking Number',
    'Shipment Date',
    'Expected Delivery Date',
    'Delivery Date',
    'Status',
    'Customer ID',
    'Customer Name',
    'Customer Email',
    'Customer Phone',
    'Order Date',
    'Refund Requested',
    'Refund Reason',
    'Refund Status',
    'Refund Date',
    'Notification Status',
]

In [4]:
# Build one Document per CSV row: the columns in `columns_to_embed` become the
# page content, the columns in `columns_to_metadata` become typed metadata.
docs = []
with open('data/mock_dataset.csv', newline="", encoding='utf-8-sig') as csvfile:
    for row in csv.DictReader(csvfile):
        # Convert all metadata to the appropriate types
        metadata = {}
        for col in columns_to_metadata:
            converted = convert_data(row[col], col)
            if isinstance(converted, dict):
                # Date columns come back as {"year", "month", "day"}; store each
                # part under its own flat key ("<column> year", ...).
                # NOTE(review): presumably because the vector store only accepts
                # scalar metadata values — confirm.
                for date_part in ['year', 'month', 'day']:
                    metadata[col + " " + date_part] = converted[date_part]
            else:
                metadata[col] = converted

        # csv.DictReader fills missing trailing cells with None; treat those as
        # empty text instead of crashing on None.strip().
        values_to_embed = {k: (row[k] or "").strip() for k in columns_to_embed if k in row}
        to_embed = "\n".join(f"{k}: {v}" for k, v in values_to_embed.items())
        docs.append(Document(page_content=to_embed, metadata=metadata))
docs[0]

Document(page_content='Product: Designer Eyeglasses\nDescription: Indie-style frames, ideal for the modern thinker.', metadata={'Order ID': 'ORD1000', 'Tracking Number': 123456789, 'Shipment Date year': 2024, 'Shipment Date month': 5, 'Shipment Date day': 10, 'Expected Delivery Date year': 2024, 'Expected Delivery Date month': 5, 'Expected Delivery Date day': 19, 'Delivery Date year': 2024, 'Delivery Date month': 5, 'Delivery Date day': 28, 'Status': 'cancel', 'Customer ID': 'CUST1000', 'Customer Name': 'William Ramirez', 'Customer Email': 'dana83@example.com', 'Customer Phone': '408.218.3405', 'Order Date year': 2024, 'Order Date month': 3, 'Order Date day': 27, 'Refund Requested': True, 'Refund Reason': 'Damaged Item', 'Refund Status': 'approved', 'Refund Date year': 2024, 'Refund Date month': 6, 'Refund Date day': 3, 'Notification Status': False})

## Delete previous ChromaDB

In [5]:
import shutil

# Delete the entire directory left over from a previous run.
# Guarded so that a first run (or Restart & Run All on a fresh checkout) does
# not fail with FileNotFoundError when the store has never been created.
if os.path.isdir('./data/chromadb3/'):
    shutil.rmtree('./data/chromadb3/')


## Store Documents in the Vector DB

In [6]:
# Embed the documents with OpenAI and store the vectors in a Chroma
# collection that is persisted under `persist_directory`.
persist_directory = './data/chromadb3/'
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory,
)

## Self-Query Retriever

In [7]:
from config import metadata_field_info, document_content_description

In [24]:
# LLM (temperature 0) used to translate a natural-language question into a
# structured query over the metadata fields described in `metadata_field_info`.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    enable_limit=True,
    verbose=True,
)

In [25]:
# Look up documents for a single customer ID.
# `invoke` replaces the deprecated `get_relevant_documents` and returns the same list of Documents.
retriever.invoke("give me documentation about Customer CUST1034 ")

[Document(page_content='Product: Sports Eyeglasses\nDescription: Durable frames designed for athletic use.', metadata={'Customer Email': 'renee52@example.net', 'Customer ID': 'CUST1034', 'Customer Name': 'Joseph Stafford', 'Customer Phone': '963-748-4179x64561', 'Delivery Date day': 1, 'Delivery Date month': 5, 'Delivery Date year': 2024, 'Expected Delivery Date day': 22, 'Expected Delivery Date month': 4, 'Expected Delivery Date year': 2024, 'Notification Status': False, 'Order Date day': 19, 'Order Date month': 3, 'Order Date year': 2024, 'Order ID': 'ORD1034', 'Refund Date day': 7, 'Refund Date month': 5, 'Refund Date year': 2024, 'Refund Reason': 'Damaged Item', 'Refund Requested': True, 'Refund Status': 'approved', 'Shipment Date day': 19, 'Shipment Date month': 4, 'Shipment Date year': 2024, 'Status': 'cancel', 'Tracking Number': 123456823})]

In [28]:
# Combine several metadata conditions (status, shipment year and month) with a result limit.
# `invoke` replaces the deprecated `get_relevant_documents` and returns the same list of Documents.
retriever.invoke("give me 10 documentation about Status cancel, and Shipment Date year is 2024, and Shipment Date month is 4")

[Document(page_content='Product: Sun Protection Eyeglasses\nDescription: Features UV protection for sunny days.', metadata={'Customer Email': 'dananthony@example.org', 'Customer ID': 'CUST1020', 'Customer Name': 'Destiny Carter', 'Customer Phone': '+1-524-226-6732x46205', 'Delivery Date day': 30, 'Delivery Date month': 4, 'Delivery Date year': 2024, 'Expected Delivery Date day': 24, 'Expected Delivery Date month': 4, 'Expected Delivery Date year': 2024, 'Notification Status': True, 'Order Date day': 10, 'Order Date month': 4, 'Order Date year': 2024, 'Order ID': 'ORD1020', 'Refund Date day': 6, 'Refund Date month': 5, 'Refund Date year': 2024, 'Refund Reason': 'Damaged Item', 'Refund Requested': True, 'Refund Status': 'approved', 'Shipment Date day': 20, 'Shipment Date month': 4, 'Shipment Date year': 2024, 'Status': 'cancel', 'Tracking Number': 123456809}),
 Document(page_content='Product: Sports Eyeglasses\nDescription: Durable frames designed for athletic use.', metadata={'Customer 

In [33]:
# Same kind of query, but the refund month is given by name ("May") rather than by number.
# `invoke` replaces the deprecated `get_relevant_documents` and returns the same list of Documents.
retriever.invoke("give me 10 documentation about Status cancel, and Shipment Date year is 2024, and Refund month is May")

[Document(page_content='Product: Designer Eyeglasses\nDescription: Indie-style frames, ideal for the modern thinker.', metadata={'Customer Email': 'craighoward@example.com', 'Customer ID': 'CUST1028', 'Customer Name': 'Emily Butler', 'Customer Phone': '(426)863-8426', 'Delivery Date day': 16, 'Delivery Date month': 5, 'Delivery Date year': 2024, 'Expected Delivery Date day': 11, 'Expected Delivery Date month': 5, 'Expected Delivery Date year': 2024, 'Notification Status': False, 'Order Date day': 27, 'Order Date month': 3, 'Order Date year': 2024, 'Order ID': 'ORD1028', 'Refund Date day': 22, 'Refund Date month': 5, 'Refund Date year': 2024, 'Refund Reason': 'Defective Item', 'Refund Requested': True, 'Refund Status': 'approved', 'Shipment Date day': 2, 'Shipment Date month': 5, 'Shipment Date year': 2024, 'Status': 'cancel', 'Tracking Number': 123456817}),
 Document(page_content='Product: Designer Eyeglasses\nDescription: Indie-style frames, ideal for the modern thinker.', metadata={'