# Exercise: Retrieval-Augmented Generation (RAG) with LangChain and Llama

This notebook demonstrates how to build a simple RAG pipeline using `langchain`, `llama-cpp-python`, and `Chroma`.  
It follows these steps:

1. **Install Dependencies**: Ensures all required packages are installed.
2. **Load Models**: Downloads an LLM and an embedding model from Hugging Face.
3. **Process Documents**: Fetches a blog post, extracts relevant content, splits it into chunks, and indexes it using a vector store.
4. **Set Up RAG Pipeline**: Defines a retriever to fetch relevant content and integrates it with a language model to generate responses.
5. **Test the RAG Pipeline**: Queries the model to retrieve and summarize information.

Run the cells in order from top to bottom; the final cell streams the system's response to a test query.

In [None]:
# Check GPU availability: print the NVIDIA driver/GPU status for this runtime.
# The LLM cell further down sets n_gpu_layers, so confirm a GPU is listed here
# before continuing.
!nvidia-smi

In [None]:
# Install required dependencies
!pip3 install llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
!pip3 install huggingface_hub==0.28.0
!pip3 install langchain==0.3.17 langchain-core==0.3.33 langchain-community==0.3.14
!pip3 install langchain-chroma==0.2.0 langchain_huggingface==0.1.2

In [None]:
import bs4

from huggingface_hub import hf_hub_download
from langchain_community.chat_models import ChatLlamaCpp
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Fetch the quantized GGUF weights from the Hugging Face Hub.
# To try a different model, search the Hub for another GGUF repository and
# change the two constants below.
LLM_REPO_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF"
LLM_FILENAME = "qwen2.5-coder-0.5b-instruct-q4_k_m.gguf"

# The returned value is passed to ChatLlamaCpp as `model_path` in the next cell.
model_path = hf_hub_download(repo_id=LLM_REPO_ID, filename=LLM_FILENAME, force_download=False)

In [None]:
# Build the chat model on top of the downloaded GGUF file.
# See the ChatLlamaCpp / llama-cpp-python documentation for the exact meaning
# of each setting.
llm_settings = {
    "model_path": model_path,
    "n_gpu_layers": 25,  # layers to offload to the GPU
    "stop": ["<|im_end|>\n"],  # end-of-turn marker used as a stop sequence
    "n_ctx": 8000,  # context window size
    "max_tokens": 8000,  # upper bound on generated tokens
    "streaming": True,  # the last cell consumes the answer via .stream()
    "n_batch": 256,
}
llm = ChatLlamaCpp(**llm_settings)

In [None]:
# Embedding model used by the vector store to index the document chunks
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Load the blog post, restricting HTML parsing to the elements whose CSS class
# marks the post title, header and body.
BLOG_URL = "https://lilianweng.github.io/posts/2023-06-23-agent/"
content_only = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))

loader = WebBaseLoader(web_paths=(BLOG_URL,), bs_kwargs={"parse_only": content_only})
docs = loader.load()

# Split the loaded document into overlapping chunks for indexing
# (sizes are measured by the splitter's length function — characters by default).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# Create a vector store using Chroma and index the document chunks.
#
# No persist_directory is given, so Chroma keeps its data in memory, and
# in-memory clients created in the same kernel share that data. Re-running this
# cell would therefore append a second copy of every chunk to the default
# collection, and the retriever would start returning duplicates. Dropping the
# default collection first makes the cell safe to re-run.
Chroma(embedding_function=embedding).delete_collection()

vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)

# Create the retriever; each query returns the 20 most similar chunks
retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

In [None]:
# Build the chat prompt: a fixed system instruction followed by a human turn
# whose {context} and {question} placeholders are filled in by the RAG chain.
system_message = SystemMessage(content="You are an AI assistant that answer questions briefly.")
human_template = HumanMessagePromptTemplate.from_template(
    "Taking into account the following information:{context}\n\n{question}"
)
prompt = ChatPromptTemplate.from_messages([system_message, human_template])

In [None]:
# Create a format function
def format_docs(docs):
    """Render retrieved documents as one bulleted block of text.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A single string in which every document's content is preceded by two
        newlines, a tab and a dash; an empty string when ``docs`` is empty.
    """
    return "".join(f"\n\n\t- {doc.page_content}" for doc in docs)


# Assemble the RAG chain.
# The incoming question is fanned out into the two prompt variables:
# "context" is the retriever's results rendered by format_docs, and "question"
# is the input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Fill the prompt, run the LLM, and reduce the model output to a plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [None]:
# Ask a test question and print the answer piece by piece as it is generated
question = "What is Task Decomposition?"
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)