# Build RAG with Huggingface and Chroma
In this notebook we build a RAG pipeline with LangChain. We use Hugging Face models for encoding and generation. The example is structured as follows:
- [Load Data and Parameters](#load-data-and-params): Load the PDF document (Bank of America Form 10-Q) and set parameters
- [Chunking](#chunking): Chunk document using `RecursiveCharacterTextSplitter`
- [Custom Embeddings](#custom-embeddings): Define a custom embedding strategy with `CLS pooling`
- [Indexing](#indexing): Index document into Chroma DB
- [Load Generative Model](#load-generative-model): Load generative model to answer query
- [Set up LLM chain](#set-up-llm-chain): Set up the chain using `PromptTemplate`, `HuggingFacePipeline` and `StrOutputParser`
- [Generation](#generation): Generate response based on retrieved context by leveraging generative model

In [None]:
!pip install --upgrade langchain langchain_community langchain-huggingface langchain-text-splitters langchain-chroma pypdf tqdm accelerate bitsandbytes python-dotenv

In [None]:
import os
import pathlib
from typing import List, Dict, Any

import torch
from tqdm.notebook import tqdm

# LangChain pieces are imported from the split packages (langchain_core,
# langchain_chroma, ...) instead of the legacy `langchain.*` re-export paths,
# which are deprecated. The same names are bound as before.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.embeddings import Embeddings
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

## Load Data and Params

In [None]:
%%bash
# Download the filing once; skip when a non-empty copy is already present.
set -euo pipefail

PDF="BAC_10Q.pdf"
URL="https://investor.bankofamerica.com/regulatory-and-other-filings/all-sec-filings/content/0000070858-24-000208/0000070858-24-000208.pdf"

# -s (exists AND non-empty) instead of -f: an empty file left by an earlier
# failed download must not be mistaken for a valid PDF.
if [ ! -s "$PDF" ]; then
    # `wget -O` creates its target even when the request fails, so fetch into a
    # temporary name and only move it into place after a successful download.
    if ! wget -q -O "$PDF.part" "$URL"; then
        rm -f "$PDF.part"
        echo "Download failed: $URL" >&2
        exit 1
    fi
    mv "$PDF.part" "$PDF"
fi

In [None]:
# Directory the PDF is read from: the notebook's current working directory.
DATA_PATH = pathlib.Path.cwd()

# Text splitting and retrieval settings.
CHUNK_SIZE = 1000     # passed as chunk_size to the text splitter
CHUNK_OVERLAP = 200   # passed as chunk_overlap to the text splitter
TOP_K = 4             # number of chunks fetched per query

# Encoder checkpoint and the tokenizer options forwarded to it.
encoding_model = "BAAI/bge-base-en-v1.5"
PADDING = True
TRUNCATION = True

# Generator checkpoint and the keyword arguments handed to the text-generation pipeline.
generative_model = "HuggingFaceH4/zephyr-7b-beta"
generation_kwargs = {
    "temperature": 0.2,
    "do_sample": True,
    "repetition_penalty": 1.1,
    "return_full_text": True,
    "max_new_tokens": 400,
}

# When USE_COLAB_SECRET is set, copy the Hugging Face token from the Colab
# secrets store into the environment. The import is kept local because
# google.colab only exists on Colab.
if os.environ.get("USE_COLAB_SECRET"):
    from google.colab import userdata

    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")


## Chunking

In [None]:
# Load the downloaded filing into `docs` and report how many documents the loader produced.
pdf_file = DATA_PATH / "BAC_10Q.pdf"
loader = PyPDFLoader(pdf_file)
docs = loader.load()
print(len(docs))

In [None]:
# Split the loaded documents into overlapping chunks sized by the config constants.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(docs)

# Raw text of every chunk, in chunk order.
text_lines = [piece.page_content for piece in chunks]

## Custom Embeddings

In [None]:
class CustomEmbeddings(Embeddings):
    """Embedding backend that CLS-pools the output of a Hugging Face encoder.

    Each text is tokenized, passed through the encoder, and represented by the
    hidden state at sequence position 0 of the last layer.

    Args:
        model_name: Checkpoint name passed to `AutoTokenizer.from_pretrained`
            and `AutoModel.from_pretrained`.
        tokenizer_kwargs: Extra keyword arguments forwarded to every tokenizer
            call (e.g. `padding`, `truncation`).
        batch_size: Maximum number of documents encoded per forward pass in
            `embed_documents`. Batching is only used when `tokenizer_kwargs`
            enables padding and the tokenizer pads on the right; otherwise
            documents are encoded one at a time.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """

    def __init__(self, model_name: str, tokenizer_kwargs: Dict[str, Any], batch_size: int = 32):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer_kwargs = tokenizer_kwargs
        self.batch_size = batch_size

    def encode(self, text):
        """Return the CLS embeddings of `text` as a NumPy array with one row per input.

        `text` may be a single string or a list of strings; it is handed to the
        tokenizer unchanged.
        """
        inputs = self.tokenizer(text, return_tensors="pt", **self.tokenizer_kwargs)
        # Keep the inputs on the same device as the model so encoding also
        # works after the model has been moved to a GPU.
        inputs = inputs.to(self.model.device)
        with torch.no_grad():
            # CLS pooling: keep only the hidden state at sequence position 0.
            embeddings = self.model(**inputs).last_hidden_state[:, 0, :].cpu().numpy()
        return embeddings

    def _can_batch(self) -> bool:
        """Tell whether several texts can safely share one forward pass."""
        padding = self.tokenizer_kwargs.get("padding", False)
        pads = padding not in (False, None, "do_not_pad")
        # Without padding, texts of different lengths cannot be stacked into one
        # tensor. With left padding, position 0 of a padded row would hold a
        # pad token rather than the first real token.
        return pads and getattr(self.tokenizer, "padding_side", "right") == "right"

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Embed `documents`, returning one vector (list of floats) per document, in order."""
        step = self.batch_size if self._can_batch() else 1
        vectors: List[List[float]] = []
        for start in range(0, len(documents), step):
            batch = list(documents[start:start + step])
            vectors.extend(self.encode(batch).tolist())
        return vectors

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string and return its vector as a list of floats."""
        return self.encode([query])[0].tolist()

In [None]:
# Tokenizer options come from the config cell; they are forwarded on every tokenizer call.
tokenizer_options = dict(padding=PADDING, truncation=TRUNCATION)
custom_emb = CustomEmbeddings(
    model_name=encoding_model,
    tokenizer_kwargs=tokenizer_options,
)

## Indexing

In [None]:
# Deterministic ids make this cell safe to re-run: the collection is persisted on
# disk, and with auto-generated ids every run would add another copy of each
# chunk. With fixed ids, a re-run overwrites the existing entries instead.
# NOTE: after changing the chunking parameters, delete ./chroma_langchain_db so
# that no entries from the previous chunking are left behind.
chunk_ids = [f"chunk-{position}" for position in range(len(chunks))]

vectorStore = Chroma.from_documents(
    documents=chunks,
    ids=chunk_ids,
    collection_name="dens_vecs_1",
    embedding=custom_emb,
    persist_directory="./chroma_langchain_db",
)

## Load Generative Model

In [None]:
# 4-bit quantization settings (NF4 with double quantization, bfloat16 compute).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The quantization config is only applied when a CUDA device is available;
# otherwise the model is loaded with default settings.
load_kwargs = {"quantization_config": bnb_config} if torch.cuda.is_available() else {}
text_generation_model = AutoModelForCausalLM.from_pretrained(generative_model, **load_kwargs)
text_generation_tokenizer = AutoTokenizer.from_pretrained(generative_model)

## Set up LLM chain

In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline that uses
# the generation settings from the config cell.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=text_generation_model,
    tokenizer=text_generation_tokenizer,
    **generation_kwargs,
)

# Expose the pipeline as a LangChain LLM so it can be composed into a chain.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Prompt in Zephyr's chat layout. Both the system turn (instructions + retrieved
# context) and the user turn are closed with </s>; the text ends on the
# assistant tag so the model continues with its answer.
PROMPT = """
<|system|>
You are a smart assistant able to analyze companies' financial documents. Use the following pieces of information enclosed in <context> tags to provide an answer to the question. Return only the answer.
<context>
{context}
</context>
</s>
<|user|>
{question}
</s>
<|assistant|>

"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT,
)

# Fill the template, run the generator, and return the output as a plain string.
llm_chain = prompt | llm | StrOutputParser()

## Generation


In [None]:
def create_context(results: List[Any]) -> str:
    """Join the `page_content` of every result into one string, separated by blank lines.

    Args:
        results: Objects exposing a `page_content` attribute.

    Returns:
        The contents in input order, separated by two newlines; an empty
        string when `results` is empty.
    """
    separator = "\n\n"
    passages = (hit.page_content for hit in results)
    return separator.join(passages)

In [None]:
query = "What's the increase of Net income for Consumer Lending?"

# Retrieved passages for this query, kept in the namespace for inspection.
results = vectorStore.similarity_search(query, k=TOP_K)
context = create_context(results)

# Retrieval runs inside the chain, so every invocation fetches the context for
# the question it actually receives. A context captured once outside the chain
# would be silently reused for any other question.
retriever = vectorStore.as_retriever(search_type="similarity", search_kwargs={"k": TOP_K})
rag_chain = {"context": retriever | create_context, "question": RunnablePassthrough()} | llm_chain

answer = rag_chain.invoke(query)
print(answer)