# 1. Description of the project

In this project, a RAG system is implemented and used in combination with LettuceDetect.

# 2. Setup

1. **Install these packages:**

In [21]:
%pip install -qq langchain langchain-unstructured langchain-chroma langchain-openai unstructured unstructured[pdf] dotenv

Note: you may need to restart the kernel to use updated packages.


2. **Deploy an Azure OpenAI LLM resource and embedding resource**

    Use the following link: https://ai.azure.com/
3. **Save the details to the .env file:**
    ```bash
    echo AZURE_OPENAI_API_KEY=\"your-api-key-here\" >> .env
    echo AZURE_OPENAI_API_VERSION=\"your-version-here\" >> .env
    echo AZURE_OPENAI_ENDPOINT=\"your-endpoint-here\" >> .env
    echo GPT_MODEL=\"your-llm-model-here\" >> .env
    echo EMBEDDINGS_MODEL_NAME=\"your-embeddings-model-here\" >> .env
    echo EMBEDDINGS_DEPLOYMENT=\"your-embeddings-deployment-here\" >> .env
    ```

# 3. ChromaDB setup

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma.vectorstores import Chroma
import os
from dotenv import load_dotenv, find_dotenv

## 3.1 The text splitter

The text splitter divides documents into manageable chunks to optimize downstream processing and retrieval in RAG workflows.

In [23]:

def text_splitter(data, debug = False):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    if debug:
        print(f"Splitting {len(data)} documents into chunks...")
    chunks = text_splitter.split_documents(data)
    return chunks

## 3.2 The document loader

The document loader reads and parses files from the corpus directory into structured document objects for downstream processing.

In [24]:
def load_documents(corpus_dir = "./corpus/", debug = False):
    loaded_docs = []
    if debug:
        print(f"Loading documents from {corpus_dir}...")
    for file in os.listdir(corpus_dir):
        if debug:
            print(f"Loading {file}...")
        loader = UnstructuredLoader(corpus_dir + file, mode = 'single')
        loaded_docs.extend(loader.load())
    return loaded_docs

## 3.3 The embedding client

The embedding client initializes and manages Azure OpenAI embeddings for converting text into vector representations.

In [25]:
def embeddings(debug = False):
    load_dotenv(find_dotenv())
    model = os.getenv('EMBEDDINGS_MODEL_NAME')
    api_key = os.getenv('AZURE_OPENAI_API_KEY')
    api_version = os.getenv("AZURE_OPENAI_API_VERSION")
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    azure_deployment = os.getenv("EMBEDDINGS_DEPLOYMENT")

    # Validate required environment variables
    if not all([model, api_key, api_version, azure_endpoint, azure_deployment]):
        raise ValueError(
            """
            Missing environment variables.
            Please load all the required environment variables in the .env file:
            EMBEDDINGS_MODEL_NAME, AZURE_OPENAI_API_KEY, AZURE_OPENAI_API_VERSION,
            AZURE_OPENAI_ENDPOINT, EMBEDDINGS_DEPLOYMENT
            """
        )
    
    # Initialize and return an Azure OpenAI embeddings client
    if debug:
        print(f"Initializing embeddings with model: {model}, deployment: {azure_deployment}")
    embeddings = AzureOpenAIEmbeddings(
        model = model,
        api_key = api_key,
        api_version = api_version,
        azure_endpoint = azure_endpoint,
        azure_deployment = azure_deployment,
    )
    return embeddings

## 3.4 The vector database

The vector database stores document embeddings for fast similarity search and retrieval. Built with Chroma, it enables efficient access to relevant document chunks in RAG workflows.

In [26]:
def create_database(document_list, database_dir = "./chroma_db", debug = False):
    # Initialize the database from a given corpus of documents
    embedding_model = embeddings(debug = debug)
    if debug:
        print(f"Creating vector database with {len(document_list)} documents...")
    vector_database = Chroma.from_documents(documents = document_list,
                                            embedding = embedding_model,
                                            persist_directory = database_dir)
    # Persist the vector database to disk
    vector_database.persist()

    # Return the vector database instance
    return Chroma(persist_directory = database_dir,
                  embedding_function = embedding_model)

## 3.5 The retriever

The retriever fetches relevant document chunks from the vector database using embeddings to match user queries with semantically similar content for efficient retrieval in RAG workflows.

In [27]:
def retriever(corpus_dir = "./corpus/", debug = False):
    docs = load_documents(corpus_dir, debug = debug)
    chunks = text_splitter(docs, debug = debug)
    vectordb = create_database(chunks, debug = debug)
    if debug:
        print("Creating retriever from vector database...")
    retriever = vectordb.as_retriever()
    return retriever

## 3.6 Small test

In [31]:
retriever_db = retriever(debug = True)

Loading documents from ./corpus/...
Loading ford_f150_lightning_2024.txt...
Loading audi_a4_2024.txt...
Loading mercedes_cclass_2024.txt...
Loading tesla_model_s_2024.txt...
Loading toyota_camry_2024.txt...
Loading bmw_3series_2024.txt...
Loading jeep_wrangler_2024.txt...
Loading porsche_911_carrera_2024.txt...
Loading subaru_outback_2024.txt...
Loading honda_accord_2024.txt...
Splitting 271 documents into chunks...
Initializing embeddings with model: text-embedding-3-small, deployment: text-embedding-3-small
Creating vector database with 271 documents...


INFO: HTTP Request: POST https://paulo-mcw0r95x-eastus2.cognitiveservices.azure.com/openai/deployments/text-embedding-3-small/embeddings?api-version=2024-12-01-preview "HTTP/1.1 200 OK"


ValueError: Expected metadata value to be a str, int, float, bool, or None, got ['eng'] which is a list in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.