<a href="https://colab.research.google.com/github/peterverhaar/exploring_ai/blob/main/A_RAG_example_using_HF_and_LangChain_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Building a simple Retrieval-Augmented Generation (RAG) pipeline**

In [3]:
!pip install transformers
!pip install chromadb
!pip install sentence-transformers
!pip install langchain
!pip install langchain_community
!pip install PyPDF2
!pip install pypdf
!pip install accelerate #is required for using "low_cpu_mem_usage"
## It is recommended to restart the session for some installations to take effect

Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.

In [4]:
import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
import torch


In [5]:
response = requests.get('https://barcelona-declaration.org/downloads/BarcelonaDeclaration.pdf')
with open("BarcelonaDeclaration.pdf",'wb') as out:
  out.write(response.content)

## **Loading a PDF file**
Assunimg the file is already uploaded in the session storage [link](https://barcelona-declaration.org/downloads/BarcelonaDeclaration.pdf)

In [6]:
loader_pdf = PyPDFLoader("BarcelonaDeclaration.pdf")
pages = loader_pdf.load_and_split()

In [7]:
print(f'The pdf file contains {len(pages)} pages.')

The pdf file contains 13 pages.


In [16]:
full_text = ''
for page in pages:
  full_text += page.page_content + ''

print(f'The full pdf contains {len(full_text)} characters.')

The full pdf contains 17933 characters.


## **Initialize the text splitter**
This can be a very important task, as different splitting methods and chunk sizes can lead to different results.

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=10)
chunks = text_splitter.split_text(full_text)

*Only if you want to see the documents distribution before and after the split, run the code below. Also, a chunk sample*

In [25]:
total_characters = 0
for chunk in chunks:
  total_characters += len(chunk)

print(f'Average length: {round(total_characters/len(chunks),2)}')

Average length: 941.95


Before split, 13 documents, with average characters: 1379.
After split,  25 chunks of ducuments, with average characters: 716 (average chunk length, as defined before).
page_content='Preamble\nCommitments\nAnnex A :  Background and context\nAnnex B :  Definitions1\n2\n4\n8INDEX' metadata={'source': 'BarcelonaDeclaration.pdf', 'page': 1}


## **Initialize embeddings**
Also different embedding models may be better suited for some specific tasks

*Note: You may need a HF_TOKEN*

In [None]:
# We need this only for some specific models, such as nomic
!pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


In [None]:
##Initialize HuggingFaceBgeEmbeddings - specifically designed for models that support query instructions
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings( #specifically designed for models that support query instructions
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={'device':'cpu'},
    encode_kwargs={'normalize_embeddings': True} # improve the performance of some similarity metrics used to compare embeddings, particularly metrics like cosine similarity (but some LLM have included it by default)
)


#--------------------------------------------------------------------------
## Other representations and models can be used too.
#embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')
#embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en-v1.5')
#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


# ## ----------------- with some parameters --------------------
# model_name = "sentence-transformers/all-mpnet-base-v2"
# model_kwargs = {'device': 'cpu'}
# encode_kwargs = {'normalize_embeddings': True}
# embeddings = HuggingFaceEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )


# ## ---------------- NOMIC ----------------------------
# # Model configuration
# model_name = "nomic-ai/nomic-embed-text-v1"
# model_kwargs = {
#     'device': 'cpu',
#     'trust_remote_code': True
# }

# # Encoding configuration
# encode_kwargs = {'normalize_embeddings': True}

# # Create the embeddings object
# embeddings = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs,
#     query_instruction="search_query:",
#     embed_instruction="search_document:"
# )



Optional - Dispay the embeddigs - if you wish to see asample of the embeddig

In [None]:
!pip install numpy



In [None]:
import numpy as np

sample_chunk = chunked_docs[0].page_content
sample_embedding = np.array(embeddings.embed_query(sample_chunk))

print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-0.01492197 -0.01214252 -0.00947365 -0.00910164  0.06471866 -0.0217666
  0.0501943   0.00817006 -0.04352816 -0.030544   -0.00359964 -0.03669583
 -0.07230897 -0.00460148 -0.01029945  0.0577138   0.01372169 -0.0239038
  0.03485873 -0.01760091  0.01744811  0.00734285  0.03129706  0.02331165
  0.03253011 -0.01352519  0.00714739  0.01706191 -0.02146911 -0.02381163
  0.02371627 -0.02012623 -0.02756214 -0.01130517  0.0084237   0.07742885
  0.00952934 -0.00121677 -0.02161149  0.00978117 -0.05827643  0.01681387
  0.03072693 -0.00613099 -0.01227543  0.01964362 -0.04856922  0.0095207
  0.01223122 -0.04113132 -0.08031183 -0.01577133  0.03613522  0.0181542
 -0.0169113   0.05293664 -0.03719901 -0.04963338 -0.01271546 -0.0412432
  0.01227097  0.01209403  0.02546003 -0.01022086 -0.00748618 -0.01887447
 -0.00694731  0.02665981 -0.02082155 -0.01082958 -0.03532306  0.01144517
 -0.01661795 -0.028183    0.02247661 -0.05608386 -0.03365264  0.07675619
  0.03338389  0.0

## **Create vectorstore** - here we used Chroma database, but another can be chosen

In [None]:
#Load extracted text(chunked_docs) into Chroma directly
vectorstore = Chroma.from_documents(chunked_docs, embeddings)


Optionally, you can perform a simple search to test it directly at vector store (through similarity search)

In [None]:
question = """What is relevance of persistent identifiers"""
relevant_documents = vectorstore.similarity_search(question, k=5) # k is the number of top results to return
print(f'There are {len(relevant_documents)} documents retrieved, relevant to the question. Display the first one:\n')
print(relevant_documents[0].page_content)



There are 5 documents retrieved, relevant to the question. Display the first one:

sources, open research information should make use of persistent identifiers such as DOIs 
(Digital Object Identifiers), ORCIDs (Open Researcher and Contributor IDs), and ROR (Research 
Organization Registry) IDs to reference research outputs, researchers, research organizations, 
and other entities. Infrastructures for open research information should be governed by relevant 
stakeholders in the academic community.


Display more results

In [None]:
for i, result in enumerate(relevant_documents):
    print(f"Result {i+1}:\n{result.page_content}\n")

Result 1:
sources, open research information should make use of persistent identifiers such as DOIs 
(Digital Object Identifiers), ORCIDs (Open Researcher and Contributor IDs), and ROR (Research 
Organization Registry) IDs to reference research outputs, researchers, research organizations, 
and other entities. Infrastructures for open research information should be governed by relevant 
stakeholders in the academic community.

Result 2:
identifiers where available.
• For systems and platforms for the internal management of research information 
(e.g., current research information systems), we will require that all relevant research 
information can be exported and made open, using standard protocols and identifiers 
where available.

Result 3:
Transparent high-quality decision making requires open research information
At a time when decision making in science is increasingly guided by indicators and analytics, 
addressing the problems of closed research information must be a top priori

## **Create the HuggingFacePipeline with the specified model and device**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import torch

#Select a LLM model. Note that different models can be selected, depenting on the use case
#model_id = "microsoft/phi-1_5"
#model_id= "microsoft/Phi-3-mini-128k-instruct"
#model_id = "mistralai/Mistral-7B-v0.3"
#model_id = "BAAI/bge-m3"
model_id = "mistralai/Mistral-7B-v0.3" # Access to this model must be authenticated (at HF)
#model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

#Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

#Ensure pad_token_id is set.  Note that not all models have the same vocabulary or special tokens.
#Some models might not have a dedicated padding token in their vocabulary.
#In such cases, the pad_token_id might be set to None by default.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

#Check if a GPU is available and set the device accordingly
device = 0 if torch.cuda.is_available() else -1

#Use mixed precision for faster inference
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

#------------------------------------------------------------------------------
# Load the model with AutoModelForCausalLM
hf_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch_dtype,
    device_map="auto"
)

#Create the HuggingFacePipeline with a positive temperature value
hf_pipeline = pipeline(
    "text-generation",
    model=hf_model,
    tokenizer=tokenizer,
    temperature=0.1,  #model creativity
    max_new_tokens=100  #Smaller number of tokens, a faster response
)

#Set the model for the LLM based on the above defined pipeline
llm = HuggingFacePipeline(pipeline=hf_pipeline)
#------------------------------------------------------------------------------

#This representation can be used too.
#Create the HuggingFacePipeline with the specified model and device
#A Hugging Face Pipeline / or using Hugging Face Transformers can include several parameters that you can specify
# hf = HuggingFacePipeline.from_model_id(
#     model_id=model_id,
#     task="text-generation",
#     device=device,
#     pipeline_kwargs={"temperature": 0, "max_new_tokens": 500}
# )


# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# !pip install accelerate
# #quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# hf = AutoModelForCausalLM.from_pretrained(model_id,
#                                             #  quantization_config=quantization_config,
#                                              low_cpu_mem_usage=True,
#                                             #  torch_dtype="auto",
#                                              torch_dtype=torch.bfloat16,
#                                              device_map="auto")

# # Ensure the pipeline's tokenizer has the pad_token_id set correctly
# #hf.pipeline.tokenizer.pad_token_id = tokenizer.pad_token_id

# llm = hf

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

  warn_deprecated(


## **Convert the vector store into a retriever object**
So, we can perform similarity searches or information retrieval based on vector embeddings

In [None]:
retriever = vectorstore.as_retriever()

## **Definition of the RAG template**
This template will format the context and question to create a prompt for generating responses

In [None]:
from langchain_core.prompts import ChatPromptTemplate

rag_template = """Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Provide only the answer, nothing else.
{context}
Question: {question}
"""
rag_prompt = ChatPromptTemplate.from_template(rag_template)


## **Setting up a RAG chain using LangChain components**

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
  {"context": retriever, "question": RunnablePassthrough()}
  | rag_prompt
  | llm
  | StrOutputParser()
)


## **Asking a question through RAG chain**

NOTE: In some cases, depending on the selected LLM, there is a need to format the generated response, i.e. remove unwanted text.
Without GPU usage it can last a long time.

In [None]:
question = "Why is the Barcelona Declaration important to the scientific community?"

#Invoke the chain and print the answer
answer = rag_chain.invoke(question)
print(answer)



Human: Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Provide only the answer, nothing else.
[Document(page_content='BARCELONA  \nDECLARATION ON\nOPEN RESEARCH \nINFORMATION\nThe Barcelona Declaration on Open Research Information was prepared by a group \nof over 25 research information experts, representing organizations that carry \nout, fund, and evaluate research, as well as organizations that provide research \ninformation infrastructures. The group met in Barcelona in November 2023 in a \nworkshop hosted by SIRIS Foundation. The preparation of the Declaration was \ncoordinated by Bianca Kramer (Sesame Open Science), Cameron Neylon (Curtin \nOpen Knowledge Initiative, Curtin University), and Ludo Waltman (Centre for Science \nand Technology Studies, Leiden University). Organizations that would like to know \nmore about the Declaration or that wish to sign the Declaration are we

In [None]:
question = "What does it mean open research?"

answer = rag_chain.invoke(question)
print(answer)



Human: Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Provide only the answer, nothing else.
[Document(page_content='Transparent high-quality decision making requires open research information\nAt a time when decision making in science is increasingly guided by indicators and analytics, \naddressing the problems of closed research information must be a top priority. Decisions \nshould be informed by open research information: information that is free to access, without \nrestrictions on how it can be used and reused. To enable linking of information from different', metadata={'page': 6, 'source': 'BarcelonaDeclaration.pdf'}), Document(page_content='In line with this recommendation, the academic community in the Netherlands has developed \nguiding principles for open research information . These principles aim to “open up research \nmetadata and data analytics”, which is essential “t

In [None]:
question = "How many members has CoARA?"

# Invoke the chain and print the answer
answer = rag_chain.invoke(question)
print(answer)



Human: Use the following pieces of context to answer the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Provide only the answer, nothing else.
[Document(page_content='should enable transparency”. The more than 600 organizations that have joined the Coalition \nfor Advancing Research Assessment (CoARA) have signed an agreement  that emphasizes \nthe need to ensure “independence and transparency of the data, infrastructure and criteria \nnecessary for research assessment and for determining research impacts”. A large number \nof organizations and individuals in Latin America and the Caribbean have signed a declaration', metadata={'page': 7, 'source': 'BarcelonaDeclaration.pdf'}), Document(page_content='available under the Creative Commons Public Domain Dedication”. The Leiden Manifesto for \nresearch metrics  advises that researchers who are being evaluated should always be able “to \nverify data and analysis”. The EU Council has ado

Formatting the generated response, i. remove any unwanted text. For different models, different formats may be needed

In [None]:
question = "How many members has CoARA?"

# Invoke the chain and print the answer
answer = rag_chain.invoke(question)

#Different models and configurrations can geerate different output
#Post-process the answer to remove unwanted continuation
if "Answer:" in answer:
    parsed_answer = answer.split("Answer:")[1]  # Get everything after "Answer:"
    if "Title:" in parsed_answer:
        parsed_answer = parsed_answer.split("Title:")[0]  # Remove everything after "Title:"
        parsed_answer = parsed_answer.strip()  # Remove any leading/trailing whitespace
    if "Question:" in answer:
        parsed_answer = parsed_answer.split("Question:")[0].strip()
        parsed_answer = parsed_answer.strip()  # Remove any leading/trailing whitespace
else:
    parsed_answer = answer.strip()

print(parsed_answer)



600
