# Retrieval Augmented Generation

In this tutorial, we start with the code you wrote for the starter example of RAG (Retrieval-Augmented Generation) and show you the most common ways you might want to customize it for your use case.

In [3]:
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import (
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TextStreamer,
    TrainingArguments,
    logging,
    pipeline,
)

In [None]:
# Hugging Face Hub identifier of the checkpoint used for generation.
model_name = "mistralai/Mistral-7B-v0.1"

# Place the whole model (the "" key) on a single device.
# torch.cuda.current_device() raises when no CUDA device is available, so fall
# back to the CPU instead of failing before anything has been loaded.
if torch.cuda.is_available():
    device_map = {"": torch.cuda.current_device()}
else:
    device_map = {"": "cpu"}

In [None]:
# Load the tokenizer for `model_name`, keeping downloaded files in a local cache.
# NOTE(review): this cache directory differs from the one passed when loading
# the model ("../models/") - confirm that this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="../models/model/")

# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the causal language model, placed according to `device_map`.
# An earlier variant of this call (kept here for reference only) additionally
# passed load_in_8bit=True and trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="../models/",
    device_map=device_map,
)

# Show the model's memory footprint: the reported value divided by 1e9,
# rounded to three decimals and labelled as GB.
print(f"{round(model.get_memory_footprint() / 1_000_000_000, 3)} GB memory used")

In [None]:
# Embedding model; it is assigned to Settings.embed_model in a later cell.
# Other checkpoints that were tried here: "all-MiniLM-L6-v2",
# "all-mpnet-base-v2" and "dangvantuan/sentence-camembert-large".
embeddings = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_name="dangvantuan/sentence-camembert-base",
    )
)

In [None]:
# Settings.llm must be a LlamaIndex LLM object; a raw transformers model is
# rejected, so wrap the loaded model and its tokenizer in HuggingFaceLLM.
Settings.llm = HuggingFaceLLM(model=model, tokenizer=tokenizer)
Settings.embed_model = embeddings

# Global chunking defaults used when documents are split into nodes.
Settings.chunk_size = 512
Settings.chunk_overlap = 128

In [27]:
# Read every file of the "data_tp6" directory into documents.
documents = SimpleDirectoryReader("data_tp6").load_data()

# Split with the chunk size and overlap currently configured on Settings.
# A bare SentenceSplitter() would use its own defaults instead, so the nodes
# printed below would not reflect the globally configured chunking.
parser = SentenceSplitter(
    chunk_size=Settings.chunk_size,
    chunk_overlap=Settings.chunk_overlap,
)
nodes = parser.get_nodes_from_documents(documents, show_progress=True)

# Print each node followed by a separator line to inspect the chunking.
for node in nodes:
    print(node)
    print('--------')


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  6.00it/s]

{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='40c6de45-a08a-4a38-acc7-321c4b396dc0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'c:\\Users\\a812364\\OneDrive - Atos\\Bureau\\Projects\\michelin\\formation_llm\\tp\\data_tp6\\paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-03-12', 'last_modified_date': '2024-03-12'}, hash='d8c7fe5e89e6132678cb68ffddcdb117b6c2687c269d1a4c1fb9e3a8533b69e8'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='6935a8dd-82cb-4515-b70f-e6f379062247', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a86dcfe59f6bd34948c27f71ce3559236525c77d68fd0be377e2548e308f9ec0')}





In [None]:
# Build a vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)

# Query engine that streams its answer and retrieves the 2 most similar nodes.
query_engine = index.as_query_engine(
    similarity_top_k=2,
    streaming=True,
)

# Ask the question and print the answer as it is generated.
response = query_engine.query("What did the author do growing up?")
response.print_response_stream()

“I want to parse my documents into smaller chunks”

In [None]:

# Global settings: used by every index built without explicit transformations.
# The overlap was set to 128 in the earlier configuration cell; lower it first
# so that it stays below the new, smaller chunk size.
Settings.chunk_overlap = 32
Settings.chunk_size = 128

# Local settings: transformations passed to from_documents apply to this index
# only and are used instead of the global Settings configured above.
# SentenceSplitter is already imported in the first cell of the notebook.
index = VectorStoreIndex.from_documents(
    documents, transformations=[SentenceSplitter(chunk_size=512)]
)

To learn more about all integrations available, check out [LlamaHub](https://llamahub.ai/).

“I want to use a different response mode”

In [None]:


# Query engine configured with the "tree_summarize" response mode.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
)

# Run the question through it and print the resulting answer.
response = query_engine.query(
    "What did the author do growing up?"
)
print(response)

“I want a chatbot instead of Q&A”

In [None]:
# Expose the same index through a chat interface instead of one-shot Q&A.
# Note: the variable keeps the name `query_engine` although it now holds the
# object returned by as_chat_engine().
query_engine = index.as_chat_engine()

# Send one chat message and print the reply.
response = query_engine.chat(
    "What did the author do growing up?"
)
print(response)

## Pour aller plus loin

Définir un PromptTemplate
Définir un SystemPrompt

In [None]:
# System prompt (in French) that frames the assistant as Michelin's documentary
# assistant and asks for answers entirely in French.
# The system section is opened with <<SYS>> and must be closed with <</SYS>>;
# the original text repeated the opening tag instead of closing the section.
system_prompt = """[INST] <<SYS>>

                            Je veux que tu te comportes comme l'assistant documentaire de Michelin.
                            L'objectif est de rédiger des paragraphes très qualitatifs et 
                            pertinents. Ne partage pas de fausses informations.
                            Réponds toujours entièrement en français.
                            <</SYS>>
                            """

In [None]:
# Template wrapped around each user query; {query_str} is the placeholder that
# receives the query text.
query_wrapper_prompt = PromptTemplate(
    "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request.\n\n### Instruction:\n{query_str}\n\n"
    "### Response:"
)

In [None]:
# Wrap the already-loaded model and tokenizer in a HuggingFaceLLM, attaching the
# system prompt and the query wrapper template defined in the cells above.
llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=3500,
    max_new_tokens=200,
    # Generation options forwarded to the LLM wrapper.
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.2,
        "repetition_penalty": 1.2,
        "pad_token_id": tokenizer.eos_token_id,
    },
)

In [None]:

# Register the customised LLM and the embedding model as the global defaults.
Settings.embed_model = embeddings
Settings.llm = llm

# Restore the global chunking configuration (size 512, overlap 128).
Settings.chunk_size = 512
Settings.chunk_overlap = 128

In [31]:
from llama_index.core.evaluation import SemanticSimilarityEvaluator
import nest_asyncio