In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain import hub
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List
from langchain_chroma import Chroma
from langchain_core.vectorstores import VST
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
from typing import Any
from langchain_core.language_models.llms import BaseLLM
from langchain_community.llms.ollama import Ollama
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def load_pdf_document(file_path: str, **kwargs: Any) -> List[Document]:
    """
    Load a PDF file into a list of documents.

    Args:
        file_path (str): Path to the PDF file.
        **kwargs: Extra keyword arguments forwarded unchanged to ``PyPDFLoader``.

    Returns:
        List[Document]: The documents returned by ``PyPDFLoader.load()``.

    Raises:
        Exception: If the loader cannot be created or the file cannot be
            loaded. The underlying error is attached as ``__cause__``.
    """
    try:
        # Instantiate loader
        loader = PyPDFLoader(file_path=file_path, **kwargs)
        # Load document
        document = loader.load()
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback
        raise Exception(f"Failed to load PDF document: {e}") from e
    return document


def split_documents(
    documents: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    **kwargs: Any,
) -> List[Document]:
    """
    Split documents into chunks.

    Args:
        documents (List[Document]): List of documents
        chunk_size (int): Size of the chunk
        chunk_overlap (int): Overlap between consecutive chunks
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        List[Document]: List of documents with chunks. The splitter is created
            with ``add_start_index=True``.

    Raises:
        Exception: If the splitter cannot be created or splitting fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        # Instantiate text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            **kwargs,
        )
        # Split text
        splits = text_splitter.split_documents(documents)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback
        raise Exception(f"Failed to split documents: {e}") from e

    return splits

def load_embeddings_model_hf(model_name: Optional[str]="sentence-transformers/all-MiniLM-L6-v2") -> Embeddings:
    """
    Load a Hugging Face embeddings model.

    Args:
        model_name (Optional[str]): Model name. ``None`` selects the same
            default as omitting the argument.

    Returns:
        Embeddings: Embeddings model, created with ``show_progress=True``.

    Raises:
        Exception: If the embeddings model cannot be created. The underlying
            error is attached as ``__cause__``.
    """
    # The signature advertises Optional[str]; treat an explicit None as
    # "use the default" instead of forwarding None to the embeddings class.
    if model_name is None:
        model_name = "sentence-transformers/all-MiniLM-L6-v2"

    try:
        # Instantiate embeddings
        embeddings_model = HuggingFaceEmbeddings(model_name=model_name, show_progress=True)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback
        raise Exception(f"Failed to load embeddings model: {e}") from e

    return embeddings_model

def load_chroma_vectorstore(documents: List[Document], embeddings_model: Embeddings, **kwargs: Any) -> Chroma:
    """
    Loads documents into Chromadb.

    Args:
        documents (List[Document]): List of documents
        embeddings_model (Embeddings): Embeddings model
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``Chroma.from_documents``.

    Returns:
        Chroma: Vector store built from the documents.

    Raises:
        Exception: If the vector store cannot be built. The underlying error
            is attached as ``__cause__``.
    """
    try:
        # Load vectorstore
        vectorstore = Chroma.from_documents(
            documents=documents, embedding=embeddings_model, **kwargs
        )
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback
        raise Exception(f"Failed to load vectorstore: {e}") from e

    return vectorstore

def load_retriever(vectorstore: VectorStore, **kwargs: Any) -> VectorStoreRetriever:
    """
    Loads retriever from vectorstore.

    Args:
        vectorstore (VectorStore): Vector store
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``vectorstore.as_retriever``.

    Returns:
        VectorStoreRetriever: Vector store retriever

    Raises:
        Exception: If the retriever cannot be created. The underlying error
            is attached as ``__cause__``.
    """
    try:
        # Load retriever
        retriever = vectorstore.as_retriever(**kwargs)
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback
        raise Exception(f"Failed to load retriever: {e}") from e

    return retriever


def load_llm_ollama(model_name:str='llama3:instruct', base_url:Optional[str]=None, **kwargs: Any) -> BaseLLM:
    """
    Load large language model from Ollama.

    Args:
        model_name (str): The name of the model to load
        base_url (Optional[str]): URL of the Ollama server. When ``None`` the
            argument is not passed, so ``Ollama`` uses its own default.
        **kwargs: Extra keyword arguments forwarded unchanged to ``Ollama``.

    Returns:
        BaseLLM: The loaded language model.

    Raises:
        ValueError: If there is an error loading the model
    """
    # Ollama's base_url is a plain string field with its own default; passing
    # None explicitly would override that default with an invalid value, so
    # only forward it when the caller actually supplied one.
    if base_url is not None:
        kwargs["base_url"] = base_url

    try:
        llm = Ollama(model=model_name, **kwargs)
    except Exception as e:
        raise ValueError(f"Error loading model {model_name}: {e}") from e

    return llm


In [12]:
PATH = "./data/responsible-use-guide-pdf.pdf"

# Step 1: read the PDF into documents
documents = load_pdf_document(file_path=PATH)

# Step 2: cut the documents into chunks (default chunk size and overlap)
splits = split_documents(documents=documents)

# Step 3: create the embeddings model (default model name)
embeddings_model = load_embeddings_model_hf()

# Step 4: embed the chunks and store them in Chroma
vectorstore = load_chroma_vectorstore(splits, embeddings_model)

# Step 5: expose the vector store as a retriever
retriever = load_retriever(vectorstore=vectorstore)



  warn_deprecated(
Batches: 100%|██████████| 3/3 [00:02<00:00,  1.28it/s]


In [13]:
# Question used for retrieval here and as the question in the RAG prompt
PROMPT = "How to use the guide"

# Retrieve the documents the retriever returns for the question
docs = retriever.invoke(PROMPT)

# Show the retrieved documents as the cell output
docs

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.78s/it]


[Document(page_content='Responsible \nUse Guide\nResources and best practices for \nresponsible development of products \nbuilt with large language models\nMeta Llama', metadata={'page': 0, 'source': './data/responsible-use-guide-pdf.pdf', 'start_index': 0}),
 Document(page_content='The recommendations included in this guide reflect \ncurrent research on responsible generative AI. We \nexpect these to evolve as the field advances and \naccess to foundation models grows, inviting further \ninnovation on AI safety. Decisions to implement \nbest practices should be evaluated based on the \njurisdiction where your products will be deployed and \nshould follow your company’s internal legal and risk \nmanagement processes.How to use this guide\nThis guide is a resource for developers that outlines \ncommon approaches to building responsibly at each \nlevel of an LLM-powered product. It covers best \npractices and considerations that developers should \nevaluate in the context of their specif

In [14]:
# Build the context: text of every retrieved document, separated by a blank line
context = "\n\n".join(chunk.page_content for chunk in docs)

context

'Responsible \nUse Guide\nResources and best practices for \nresponsible development of products \nbuilt with large language models\nMeta Llama\n\nThe recommendations included in this guide reflect \ncurrent research on responsible generative AI. We \nexpect these to evolve as the field advances and \naccess to foundation models grows, inviting further \ninnovation on AI safety. Decisions to implement \nbest practices should be evaluated based on the \njurisdiction where your products will be deployed and \nshould follow your company’s internal legal and risk \nmanagement processes.How to use this guide\nThis guide is a resource for developers that outlines \ncommon approaches to building responsibly at each \nlevel of an LLM-powered product. It covers best \npractices and considerations that developers should \nevaluate in the context of their specific use case and \nmarket. It also highlights some mitigation strategies \nand resources available to developers to address risks \nat var

In [15]:
# NOTE(review): this import lives in the middle of the notebook; the other
# imports are grouped in the first code cell — consider moving it there.
from langchain_core.prompts import ChatPromptTemplate

# Prompt template with two placeholders: {context} (retrieved text) and
# {question} (the user question).
RAG_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Generate prompt
# Formatting the chat template produces a single string; the recorded cell
# output shows it prefixed with "Human: ".
prompt_template = ChatPromptTemplate.from_template(RAG_TEMPLATE)
rag_prompt = prompt_template.format(context=context, question=PROMPT)

# Show the final prompt string as the cell output
rag_prompt


'Human: \nAnswer the question based only on the following context:\n\nResponsible \nUse Guide\nResources and best practices for \nresponsible development of products \nbuilt with large language models\nMeta Llama\n\nThe recommendations included in this guide reflect \ncurrent research on responsible generative AI. We \nexpect these to evolve as the field advances and \naccess to foundation models grows, inviting further \ninnovation on AI safety. Decisions to implement \nbest practices should be evaluated based on the \njurisdiction where your products will be deployed and \nshould follow your company’s internal legal and risk \nmanagement processes.How to use this guide\nThis guide is a resource for developers that outlines \ncommon approaches to building responsibly at each \nlevel of an LLM-powered product. It covers best \npractices and considerations that developers should \nevaluate in the context of their specific use case and \nmarket. It also highlights some mitigation strateg