# Retrieval Augmented Generation

In this tutorial, we start from the code you wrote for the RAG (Retrieval-Augmented Generation) starter example and show you the most common ways you might want to customize it for your use case.

In [None]:
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextStreamer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)   
from llama_index.core import Settings, PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

from llama_index.core.node_parser import SentenceSplitter
import torch

Replace `model_name` with the model you chose in the previous practical exercises.

In [None]:
# Hugging Face Hub identifier of the causal language model used in this notebook.
model_name = "mistralai/Mistral-7B-v0.1"
# Single-entry device map keyed by the empty string, pointing at the currently
# selected CUDA device (presumably places the whole model there; requires a GPU).
device_map = {"": torch.cuda.current_device()}

In [None]:
# Load the tokenizer that matches `model_name`; downloaded files are cached
# under the given local directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="../models/model/",
)

# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the causal language model according to `device_map`, caching the
# downloaded weights locally.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="../models/",
    device_map=device_map,
)

# Report the model's memory footprint in gigabytes (1 GB = 10**9 bytes here).
print(f"{round(model.get_memory_footprint() / 1_000_000_000, 3)} GB memory used")


Reminder: sentence and word embedding models produce mathematical vector representations of the content and semantics of text. Sentence embeddings are particularly useful for querying documents: clustering, information retrieval, and semantic search.

Try different sentence-embedding models:

- [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
- [dangvantuan/sentence-camembert-base](https://huggingface.co/dangvantuan/sentence-camembert-base)
- [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
- [dangvantuan/sentence-camembert-large](https://huggingface.co/dangvantuan/sentence-camembert-large)

In [None]:
# Wrap a LangChain HuggingFace sentence-embedding model so LlamaIndex can use it.
# Exactly one HuggingFaceEmbeddings(...) line must be active: move the comment
# markers to try one of the other models listed above.
embeddings=LangchainEmbedding(
    # HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    HuggingFaceEmbeddings(model_name="dangvantuan/sentence-camembert-base")
    # HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
    # HuggingFaceEmbeddings(model_name="dangvantuan/sentence-camembert-large")
)

In [None]:
# System prompt (in French) passed to the LLM wrapper: it asks the model to act
# as Michelin's documentation assistant and to always answer in French.
# The system block is opened with <<SYS>> and must be closed with <</SYS>>;
# the original closed it with a second <<SYS>>, leaving the block unterminated.
# NOTE(review): the [INST] opened here is not closed inside this string.
system_prompt = """[INST] <<SYS>>

                            Je veux que tu te comportes comme l'assistant documentaire de Michelin.
                            L'objectif est de rédiger des paragraphes très qualitatifs et 
                            pertinents. Ne partage pas de fausses informations.
                            Réponds toujours entièrement en français.
                            <</SYS>>
                            """

In [None]:
# Template wrapped around every user query; `{query_str}` is replaced by the
# query text. The adjacent string literals are concatenated by Python, so the
# preamble must end with its own separator: without the "\n\n" the text became
# "...appropriée.### Instruction:" with the header glued to the sentence.
query_wrapper_prompt = PromptTemplate(
    "Tu trouveras ci-dessous une instruction décrivant une tâche. "
    "Rédige une réponse qui complète la requête de manière appropriée.\n\n"
    "### Instruction:\n{query_str}\n\n### Réponse:"
)

Play with [`generate_kwargs` parameters](https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate)

In [None]:
# Wrap the already-loaded model and tokenizer in a LlamaIndex LLM, together
# with the system prompt, the query template and the generation settings.
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=3500,
    max_new_tokens=200,
    # Keyword arguments forwarded to text generation; tune these to experiment.
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.2,
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.eos_token_id,
    },
)

In [None]:
# Register the global LlamaIndex defaults used by the cells below.
Settings.llm = llm  # LLM built above from the local Hugging Face model
Settings.embed_model = embeddings  # sentence-embedding model selected above
# Chunking parameters for splitting documents into nodes.
Settings.chunk_size = 512
Settings.chunk_overlap = 128

Set `data_path` to the folder that contains your own documents.

In [None]:
# Load every file found under `data_path` as LlamaIndex documents.
data_path = "data_tp6"
documents = SimpleDirectoryReader(data_path).load_data()

# Preview how the documents are split into nodes. The splitter is built from
# the chunk size and overlap configured on `Settings`, so the preview uses the
# same chunking as the global configuration; a bare SentenceSplitter() would
# fall back to its own defaults and show different chunks.
parser = SentenceSplitter(
    chunk_size=Settings.chunk_size,
    chunk_overlap=Settings.chunk_overlap,
)
nodes = parser.get_nodes_from_documents(documents, show_progress=True)

# Print each node followed by a separator line for visual inspection.
for node in nodes:
    print(node)
    print('--------')


In [None]:
# Build a vector index from the loaded documents.
index = VectorStoreIndex.from_documents(documents)
# Query engine that streams its answer and retrieves the 2 most similar chunks.
query_engine = index.as_query_engine(streaming=True,similarity_top_k=2)
response = query_engine.query("What did the author do growing up?")
# Print the streamed response as it is generated.
response.print_response_stream()

“I want to parse my documents into smaller chunks”

In [None]:

# Global settings: lower the chunk size for every index built afterwards.
# The overlap is lowered together with it: the notebook previously set the
# overlap to 128, which would equal the new chunk size.
Settings.chunk_size = 128
Settings.chunk_overlap = 32

# Local settings
from llama_index.core.node_parser import SentenceSplitter

# Transformations passed here apply to this index only. They use the same
# smaller chunking so the index really is built from smaller chunks.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[SentenceSplitter(chunk_size=128, chunk_overlap=32)],
)

To learn more about all integrations available, check out [LlamaHub](https://llamahub.ai/).

“I want to use a different response mode”

In [None]:
# Rebuild the query engine with the "tree_summarize" response mode
# (non-streaming this time, so the response can be printed directly).
query_engine = index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("What did the author do growing up?")
print(response)

“I want a chatbot instead of Q&A”

In [None]:
# Turn the index into a chat engine instead of a one-shot query engine.
# NOTE: the variable keeps the name `query_engine` but now holds a chat engine,
# which is why `.chat(...)` is called instead of `.query(...)`.
query_engine = index.as_chat_engine()
response = query_engine.chat("What did the author do growing up?")
print(response)