**import libraries**

# **Task 1 objective**:

Our goal is to implement Retrieval-Augmented Generation (RAG) to improve the model’s ability to generate accurate and contextually relevant responses by retrieving information from a predefined dataset or knowledge base.



In [None]:
# install necessary libraries
!pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap datasets langchain-community ragatouille

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install pyarrow==14.0.1


Collecting pyarrow==14.0.1
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 17.0.0
    Uninstalling pyarrow-17.0.0:
      Successfully uninstalled pyarrow-17.0.0
Successfully installed pyarrow-14.0.1


In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import load_dataset
import matplotlib.pyplot as plt

# Raw knowledge Base
from langchain.schema import Document as LangchainDocument
# Chunking phase :
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


# Indexation phase :
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Visualization :
import pacmap
import numpy as np
import plotly.express as px

# Generation phase :
from transformers import pipeline

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


# Reranking :
from ragatouille import RAGPretrainedModel

pd.set_option("display.max_colwidth", None)  # This will be helpful when visualizing retriever outputs

**load dataset**

In [None]:
# NOTE(review): `ds` is not assigned in any cell visible in this notebook; presumably it was
# created with `load_dataset(...)` (imported above but never called here) in a cell that was
# removed. Restore that cell so the notebook runs from a fresh kernel — TODO confirm dataset id.
ds.column_names

In [None]:
ds.features


In [None]:
# We take a fixed-seed random sample of the dataset
SAMPLE_SIZE = 20000
SAMPLE_SEED = 42

# Cap the sample at the dataset length so `select` cannot index past the end
# when the dataset holds fewer than SAMPLE_SIZE rows.
ds_ = ds.shuffle(seed=SAMPLE_SEED).select(range(min(SAMPLE_SIZE, len(ds))))

ds_

**A knowledge base of documents with their rich metadata**

In [None]:
# Wrap every sampled row in a LangChain document: the abstract becomes the page
# content and the bibliographic fields are carried along as metadata.
RAW_KNOWLEDGE_BASE = []
for row in tqdm(ds_):
    row_metadata = {
        "title": row["title"],
        "authors": row["authors"],
        "submitter": row["submitter"],
        "categories": row["categories"],
        "journal reference": row["journal-ref"],
    }
    RAW_KNOWLEDGE_BASE.append(
        LangchainDocument(page_content=row["abstract"], metadata=row_metadata)
    )

In [None]:
RAW_KNOWLEDGE_BASE[3]

**Embedding the docs and chunking phase :**

In [None]:
# Separator list handed to RecursiveCharacterTextSplitter in `split_documents`,
# ordered from coarse (Markdown structure) down to single characters.
# NOTE(review): several entries are written as regular expressions, but the splitter
# is built without `is_separator_regex=True`; presumably they are then matched
# literally — confirm against the LangChain documentation.
Markdown_seprators = [
    "\n#{1,6} ",  # Markdown heading with 1-6 '#' characters (regex syntax)
    "```\n",  # code fence
    "\n\\*\\*\\*+\n",  # horizontal rule made of asterisks (regex syntax)
    "\n---+\n",  # horizontal rule made of dashes (regex syntax)
    "\n___+\n",  # horizontal rule made of underscores (regex syntax)
    "\n\n",  # blank line, i.e. paragraph break
    "\n",  # line break
    " ",  # space
    ""  # empty string (presumably a last-resort character-level split)
]

In [None]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the unique chunks.

    Args:
        chunk_size: Maximum chunk length, measured in tokens of the chosen tokenizer.
            Consecutive chunks overlap by a tenth of this value.
        knowledge_base: Documents to split.
        tokenizer_name: Hugging Face tokenizer used to measure length; `None` falls
            back to `EMBEDDING_MODEL_NAME`.

    Returns:
        The chunks in their original order, keeping only the first chunk seen for
        any given `page_content`.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        # The parameter is Optional, so make `None` mean "use the embedding model's tokenizer".
        AutoTokenizer.from_pretrained(tokenizer_name or EMBEDDING_MODEL_NAME),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=Markdown_seprators,
    )

    # Split one document at a time so tqdm reports progress per source document.
    docs_processed = []
    for doc in tqdm(knowledge_base):
        docs_processed.extend(text_splitter.split_documents([doc]))

    # Remove duplicates (first occurrence wins, order preserved)
    seen_texts = set()
    docs_processed_unique = []
    for doc in tqdm(docs_processed):
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

# Chunk the whole raw knowledge base.
# 512 tokens: we choose a chunk size adapted to our model.
docs_processed = split_documents(
    chunk_size=512,
    knowledge_base=RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

Let's save the processed docs.

In [None]:
import pickle

# Serialize the chunked documents to disk with pickle
with open('docs_processed.pkl', 'wb') as pickle_file:
    pickle.dump(docs_processed, pickle_file)

print("docs_processed has been saved as docs_processed.pkl")


**Indexation : loading the embedded chunks into the Faiss Vector DB**

In [None]:
# Fall back to CPU when no CUDA device is available instead of failing at model load.
EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Embed every chunk and store the vectors in a FAISS index using the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [None]:
# Save the FAISS index to disk
# NOTE(review): this is a hardcoded absolute Google Drive path and no cell visible here
# mounts Drive — presumably it was mounted manually; verify before running.
# NOTE(review): `save_local` is documented to take a folder path, so 'index.faiss' is
# presumably created as a directory rather than a single file — TODO confirm.
KNOWLEDGE_VECTOR_DATABASE.save_local('/content/drive/MyDrive/3DpresentationF/codes/index.faiss')


In [None]:
# Embed a user query in the same space
user_query = "Which authors did discuss interesting theories in mathematics?"
query_vector = embedding_model.embed_query(user_query)
# Show the dimensionality and a short preview rather than dumping every component.
print(f"Embedding dimension: {len(query_vector)}")
print(query_vector[:5])

In [None]:
# Look up the five chunks closest to the user's query, then show the best match
print(f"\nStarting retrieval for {user_query=}...")
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)

print("\n==================================Top document==================================")
top_document = retrieved_docs[0]
print(top_document.page_content)
print("==================================Metadata==================================")
print(top_document.metadata)

**Build Reader (Generation) Model :**

In [None]:
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-alpha"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized reader model and its tokenizer
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Generation settings shared by every call made through the pipeline
generation_settings = dict(
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)

READER_LLM = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

Let's save the **Reader LLM**

In [None]:
# Write the model and the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('quantized_model/')

print("Model and tokenizer saved to 'quantized_model/' directory")

**RAG_PROMPT_TEMPLATE creation**

In [None]:
# System turn: instructions for the reader model
system_message = {
    "role": "system",
    "content": """Using the information contained in the context,
give a comprehensive detailled answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the name of the authors, do not provide the number of the document.
If the answer cannot be deduced from the context, do not give an answer.""",
}

# User turn: carries the `{context}` and `{question}` placeholders filled in with str.format
user_message = {
    "role": "user",
    "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
}

prompt_in_chat_format = [system_message, user_message]

# Render the chat messages to a single prompt string using the tokenizer's chat template
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

**Constructing the context to pass to the generator LLM**

In [None]:
# First test: retrieve context for the question that is actually being asked, so the
# reader is not handed documents that were fetched for a different, earlier query.
test_question = "Show me papers related to quantum computing with a focus on error correction!"
test_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=test_question, k=5)

retrieved_docs_text = [doc.page_content for doc in test_docs]  # We only need the text of the documents
context = "\nExtracted documents:\n"
# Each entry ends with a newline so the next "Document N:::" header starts on its own line.
context += "".join(f"Document {i}:::\n{doc}\n" for i, doc in enumerate(retrieved_docs_text))

final_prompt = RAG_PROMPT_TEMPLATE.format(question=test_question, context=context)

# Generate an answer
answer = READER_LLM(final_prompt)[0]["generated_text"]
print(answer)

**Combine the whole process into one function**

In [None]:
def answer_with_rag(
    question: str,
    llm: pipeline,
    knowledge_index: FAISS,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
) -> Tuple[str, List[str]]:
    """
    Answer `question` with retrieval-augmented generation.

    Args:
        question: The user question; used as the retrieval query and inserted in the prompt.
        llm: Text-generation callable; it is called with the prompt string and the answer
            is read from `[0]["generated_text"]` of its result.
        knowledge_index: Vector store queried through `similarity_search`.
        reranker: Optional reranker; when given, its `rerank` method reorders the retrieved texts.
        num_retrieved_docs: Number of chunks fetched from the index.
        num_docs_final: Maximum number of chunks placed in the prompt.

    Returns:
        `(answer, relevant_docs)` where `relevant_docs` is the list of chunk texts
        (plain strings) that were placed in the prompt.
    """
    # Gather documents with retriever
    print("=> Retrieving documents...")
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    # Remember each chunk's metadata by its text: the reranker receives and returns strings only.
    metadata_by_text = {doc.page_content: doc.metadata for doc in retrieved}
    relevant_docs = [doc.page_content for doc in retrieved]  # Keep only the text

    # Optionally rerank results; never ask for more results than there are candidates,
    # and skip the reranker entirely when there is nothing to rank.
    top_k = min(num_docs_final, len(relevant_docs))
    if reranker and top_k > 0:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs, k=top_k)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    for i, text in enumerate(relevant_docs):
        metadata = metadata_by_text.get(text) or {}
        entry = f"Document {i}:::\n"
        # The system prompt asks for author names, so expose title/authors when the
        # chunk's metadata provides them.
        for label, key in (("Title", "title"), ("Authors", "authors")):
            if metadata.get(key):
                entry += f"{label}: {metadata[key]}\n"
        # Trailing newline keeps the next "Document N:::" header on its own line.
        context += entry + text + "\n"

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate an answer
    print("=> Generating answer...")
    answer = llm(final_prompt)[0]["generated_text"]

    return answer, relevant_docs

In [None]:
# Reranking the retrieved docs: improves the retrieval step
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

In [None]:
question = "Show me papers related to quantum computing with a focus on error correction!"

# Run the full RAG pipeline with the reranker enabled
answer, relevant_docs = answer_with_rag(
    question=question,
    llm=READER_LLM,
    knowledge_index=KNOWLEDGE_VECTOR_DATABASE,
    reranker=RERANKER,
)

In [None]:
# Show the generated answer followed by each source chunk returned with it
print("==================================Answer==================================")
print(answer)
print("==================================Source docs==================================")
for doc_number, source_text in enumerate(relevant_docs):
    print(f"Document {doc_number}------------------------------------------------------------")
    print(source_text)

**chatbot_response function**

In [None]:
def chatbot_response(user_query: str) -> str:
    """
    Answer `user_query` with the RAG pipeline and return the reply text for the chat UI.

    Delegates to `answer_with_rag` (no reranker, 5 retrieved chunks) instead of
    repeating the retrieve / build-context / generate steps, so both entry points
    build their prompt the same way.

    Args:
        user_query: The question typed by the user.

    Returns:
        The generated answer prefixed with "Echo: ".
    """
    answer, _ = answer_with_rag(
        user_query,
        READER_LLM,
        KNOWLEDGE_VECTOR_DATABASE,
        num_retrieved_docs=5,
        num_docs_final=5,
    )
    # NOTE(review): the "Echo: " prefix looks like a leftover from an echo-bot example;
    # kept unchanged because consumers of this function may rely on it — confirm.
    return f"Echo: {answer}"

In [None]:
chatbot_response(user_query)

In [None]:
!pip install gradio

In [None]:
import gradio as gr

# Text-in / text-out web UI: every submitted input is passed to `chatbot_response`
iface = gr.Interface(
    title="Simple Echo Chatbot",
    fn=chatbot_response,
    inputs="text",
    outputs="text",
)

# Start the web app
iface.launch()


Let's test whether **app.py** works well with the model resources.