In [None]:
## RAG Hands-on tutorial
"""
Step 1: Vector Store
Step 2: Search the vector store and retrieve relevant documents
Step 3: Call LLM with the user query and the retrieved documents
Step 4: Return the LLM response to the user
"""


In [14]:
import os
import json
import logging
import sys

from dotenv import load_dotenv
load_dotenv(override=True)

# create and configure logger
logging.basicConfig(level=logging.INFO, datefmt='%Y-%m-%dT%H:%M:%S',
                    format='%(asctime)-15s.%(msecs)03dZ %(levelname)-7s : %(name)s - %(message)s',
                    handlers=[logging.FileHandler("llm.log"), logging.StreamHandler(sys.stdout)])
# create log object with current module name
log = logging.getLogger(__name__)

In [2]:
%%sh
which python




/Users/minum/.pyenv/versions/3.12.0/bin/python


In [3]:
%%sh
conda create -n rag


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done




  current version: 4.12.0
  latest version: 24.5.0

Please update conda by running

    $ conda update -n base -c defaults conda





## Package Plan ##

  environment location: /Users/minum/opt/anaconda3/envs/rag



Proceed ([y]/n)? Invalid choice: conda activate rag
Proceed ([y]/n)? Invalid choice: which python
Proceed ([y]/n)? 
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
#
# To activate this environment, use
#
#     $ conda activate rag
#
# To deactivate an active environment, use
#
#     $ conda deactivate



In [4]:
%%sh
conda activate rag
which python


CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
To initialize your shell, run

    $ conda init <SHELL_NAME>

Currently supported shells are:
  - bash
  - fish
  - tcsh
  - xonsh
  - zsh
  - powershell

See 'conda init --help' for more information and options.

IMPORTANT: You may need to close and restart your shell after running 'conda init'.




/Users/minum/.pyenv/versions/3.12.0/bin/python


In [6]:
%%sh
pip install -r requirements.txt

Collecting langchain==0.1.7 (from -r requirements.txt (line 2))
  Obtaining dependency information for langchain==0.1.7 from https://files.pythonhosted.org/packages/3c/7c/138106d78cde8b994665251b0681ba59aede8e84902418bab8ea272f4585/langchain-0.1.7-py3-none-any.whl.metadata
  Using cached langchain-0.1.7-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community==0.0.20 (from -r requirements.txt (line 3))
  Obtaining dependency information for langchain-community==0.0.20 from https://files.pythonhosted.org/packages/44/21/0c26e7f4cbea8ecc22c21dda8cca29a378b9d2795aebaa47ed40b130979d/langchain_community-0.0.20-py3-none-any.whl.metadata
  Using cached langchain_community-0.0.20-py3-none-any.whl.metadata (8.1 kB)
Collecting langchain-core==0.1.23 (from -r requirements.txt (line 4))
  Obtaining dependency information for langchain-core==0.1.23 from https://files.pythonhosted.org/packages/b1/e9/7e624fe4a7619821331ad2e943fbfc2eab7465cf97ee95158c435a276d3e/langchain_core-0.1.23-py3-none-an

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
marvin 2.3.3 requires openai>=1.21.0, but you have openai 1.12.0 which is incompatible.
langchain-experimental 0.0.57 requires langchain<0.2.0,>=0.1.15, but you have langchain 0.1.7 which is incompatible.
langchain-experimental 0.0.57 requires langchain-core<0.2.0,>=0.1.41, but you have langchain-core 0.1.23 which is incompatible.
langchain-text-splitters 0.0.1 requires langchain-core<0.2.0,>=0.1.28, but you have langchain-core 0.1.23 which is incompatible.
langgraph 0.0.37 requires langchain-core<0.2.0,>=0.1.42, but you have langchain-core 0.1.23 which is incompatible.[0m[31m


Successfully installed langchain-0.1.7 langchain-community-0.0.20 langchain-core-0.1.23 langsmith-0.0.87 openai-1.12.0


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [17]:
# data loaders
from langchain_community.document_loaders import CSVLoader, DataFrameLoader, PyPDFLoader, Docx2txtLoader, UnstructuredRSTLoader, DirectoryLoader


class DataLoaders:
    """
    specify all data loaders here
    """
    def __init__(self, data_dir_path):
        self.data_dir_path = data_dir_path
    
    def csv_loader(self):
        csv_loader_kwargs = {
                            "csv_args":{
                                "delimiter": ",",
                                "quotechar": '"',
                                },
                            }
        dir_csv_loader = DirectoryLoader(self.data_dir_path, glob="**/*.csv", use_multithreading=True,
                                    loader_cls=CSVLoader, 
                                    loader_kwargs=csv_loader_kwargs,
                                    )
        return dir_csv_loader
    
    def pdf_loader(self):
        dir_pdf_loader = DirectoryLoader(self.data_dir_path, glob="**/*.pdf",
                                    loader_cls=PyPDFLoader,
                                    )
        return dir_pdf_loader
    
    def word_loader(self):
        dir_word_loader = DirectoryLoader(self.data_dir_path, glob="**/*.docx",
                                    loader_cls=Docx2txtLoader,
                                    )
        return dir_word_loader
    
    def rst_loader(self):
        rst_loader_kwargs = {
                        "mode":"single"
                        }
        dir_rst_loader = DirectoryLoader(self.data_dir_path, glob="**/*.rst",
                                    loader_cls=UnstructuredRSTLoader, 
                                    loader_kwargs=rst_loader_kwargs
                                    )
        return dir_rst_loader
    
    
    def get_text_metadatas(csv_data=None, pdf_data=None, word_data=None, rst_data=None):
        """
        Format text and metadata content
        """
        csv_texts = [doc.page_content for doc in csv_data]
        csv_metadatas = [{'source': doc.metadata['source'], 'row_page': doc.metadata['row']} for doc in csv_data] # metadata={'source': 'filename.csv', 'row': 0}
        pdf_texts = [doc.page_content for doc in pdf_data]
        pdf_metadatas = [{'source': doc.metadata['source'], 'row_page': doc.metadata['page']} for doc in pdf_data]  # metadata={'source': 'data/filename.pdf', 'page': 8}
        word_texts = [doc.page_content for doc in word_data]
        word_metadatas = [{'source': doc.metadata['source'], 'row_page': ''} for doc in word_data] 
        rst_texts = [doc.page_content for doc in rst_data]
        rst_metadatas = [{'source': doc.metadata['source'], 'row_page': ''} for doc in rst_data]         # metadata={'source': 'docs/images/architecture/index.rst'}
        
        for doc in pdf_data:
            print(doc)
            break

        texts = csv_texts + pdf_texts + word_texts + rst_texts
        metadatas = csv_metadatas + pdf_metadatas + word_metadatas + rst_metadatas
        return texts, metadatas

In [18]:
# load data
#data_dir_path = os.getenv('DATA_DIR_PATH', "data")
data_dir_path ="docs"
data_loader = DataLoaders(data_dir_path=data_dir_path)
log.info("Loading files from directory %s", data_dir_path)
dir_csv_loader = data_loader.csv_loader()
dir_word_loader = data_loader.word_loader()
dir_pdf_loader = data_loader.pdf_loader()
dir_rst_loader = data_loader.rst_loader()
csv_data = dir_csv_loader.load()
word_data = dir_word_loader.load()
pdf_data = dir_pdf_loader.load()
rst_data = dir_rst_loader.load()
texts , metadatas = DataLoaders.get_text_metadatas(csv_data, pdf_data, word_data, rst_data)



2024-05-20T12:18:13.639Z INFO    : __main__ - Loading files from directory docs
2024-05-20T12:18:18.133Z INFO    : unstructured - Reading document from string ...
2024-05-20T12:18:18.134Z INFO    : unstructured - Reading document ...
2024-05-20T12:18:18.138Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.138Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.139Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.139Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.140Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.140Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.140Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:18.141Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:





2024-05-20T12:18:24.914Z INFO    : unstructured - Reading document from string ...
2024-05-20T12:18:24.915Z INFO    : unstructured - Reading document ...
2024-05-20T12:18:24.942Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.943Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.943Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.944Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.944Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.945Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.945Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.946Z INFO    : unstructured - HTML element instance has no attribute type
2024-05-20T12:18:24.947Z INFO    : unstructured - HTML element instance has no attribute type

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000,
        chunk_overlap=200,
        separators=[
            "\n\n", "\n", ". ", " ", ""
        ]  # try to split on paragraphs... fallback to sentences, then chars, ensure we always fit in context window
    )

docs: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)


In [21]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
collection_name = os.getenv('QDRANT_COLLECTION_NAME', "data-collection")

# create vector Store
vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name=collection_name,
    )

2024-05-20T12:44:20.161Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:44:21.003Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:44:21.946Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:44:23.284Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:44:24.038Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [23]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Retrieve and generate using the relevant snippets
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm: ChatOpenAI = ChatOpenAI(
            temperature=0,
            model="gpt-4o",   # model="gpt-4-0125-preview",
            max_retries=500,
        )

In [24]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [25]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [26]:
rag_chain.invoke("What is Delta?")



2024-05-20T12:49:49.922Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:49:52.623Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'Delta is a high-performance computing system designed to help applications transition from CPU-only to GPU or hybrid CPU-GPU codes. It features a single processor architecture (AMD) across all node types and supports various GPU configurations, including NVIDIA A100 and A40 GPUs. Delta also includes advanced features like raytracing hardware support, large memory nodes, and a high-bandwidth interconnect.'

In [27]:
## adding sources
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [28]:
rag_chain_with_source.invoke("What is Delta?")

2024-05-20T12:53:11.407Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-05-20T12:53:13.391Z INFO    : httpx - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'context': [Document(page_content='Acknowledging Delta\n\nSee Delta\nCitations for information on how to properly acknowledge the NCSA\nDelta system or Delta Project/NCSA Staff.\n\nSee Acknowledging\nACCESS for information on how to properly acknowledge ACCESS.', metadata={'source': 'docs/acknowledge.rst', '_id': 'aae7813623014c46a37de26d2e7b3243', '_collection_name': 'data-collection'}),
  Document(page_content='Acknowledging Delta\n \nSee Delta Citations for information on how to properly acknowledge the NCSA Delta system or Delta\nProject/NCSA Staff.\nSee Acknowledging ACCESS for information on how to properly acknowledge ACCESS.', metadata={'source': 'docs/acknowledge.pdf', 'row_page': 0, '_id': '2fb1a78388074e39903ac497b8efe868', '_collection_name': 'data-collection'}),
  Document(page_content="System Architecture\n \nDelta is designed to help applications transition from CPU-only to GPU or hybrid CPU-GPU codes. Delta has\nsome important architectural features to facilitate new d