# RAG (Retrieval Augmented Generation) with PHI-3.5

This notebook is a implementation of the RAG model with PHI-3.5.

In [None]:
!pip install langchain langchain_community langchain_huggingface faiss-cpu sentence-transformers pypdf -q

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_groq import ChatGroq

In [5]:
# Load and preprocess the PDF documents
def load_pdfs(file_paths):
    documents = []
    for path in file_paths:
        loader = PyPDFLoader(path)
        documents.extend(loader.load())
    return documents

pdf_files = ["/kaggle/working/books/cancer-principles.pdf", "/kaggle/working/books/science-of-nutrition-2nd-edition.pdf", "/kaggle/working/books/understanding-heart-disease.pdf"]
documents = load_pdfs(pdf_files)

# View page 100 of the first document
str(documents[100])[:100]

"page_content='synthesis process.\nM-PHASE ENTRY AND EXIT\nOnce the cell has copied the entire genome, "

In [28]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=150)
chunks = text_splitter.split_documents(documents)
len(chunks)

41888

In [29]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [30]:
%mkdir vectorstore
vectorstore.save_local("/kaggle/working/vectorstore")

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [53]:
# lets try similarity search for a query in the vectorstore
query = "What causes cancer and how to avoid it?"
k = 2  # Number of results to return
vectorstore.similarity_search(query, k=k)

[Document(metadata={'source': '/kaggle/working/books/cancer-principles.pdf', 'page': 191}, page_content="of cancer will be needed to extend the ongoing increases in patient survival.\nCAUSES OF CANCER\nThe causes of the malignancies are as diverse as the causes of disease in general. They include genetic defects, environmental and lifestyle factors, and chemical, \nphysical and biologic agents. We address primarily the known human carcinogens but will mention a few suspect agents in context . Our list of 72 causes, or groups of \ncauses Table 14.3-1  is adapted from the IARC's Group 1 list of 75 agents, mixtures and exposure circumstances for which there was judged to be suf ficient evidence"),
 Document(metadata={'source': '/kaggle/working/books/science-of-nutrition-2nd-edition.pdf', 'page': 425}, page_content='Heredity,Lifestyle Choices,and Infectious and Environmental Agents Can Increase Cancer Risk\nCancer is the second leading cause of death in the United States, and researchers e

In [32]:
os.environ["GROQ_API_KEY"] = "api_key"  # Set your Groq API key
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.7)

In [33]:
from langchain_core.prompts import ChatPromptTemplate

user_template = """You are a helpful medical AI assistant.
instructions:
- Answer the query based on the context inside the context tags. However, If the user asks something that doesn't need context, just ignore the context provided, and jus respond normally as the user doesn't know anything about the context.
<context>
{context}
</context>
Query:
{input}
Answer: """
prompt = ChatPromptTemplate.from_template(user_template)

In [34]:
from langchain.chains.combine_documents import create_stuff_documents_chain

doc_chain = create_stuff_documents_chain(llm, prompt)

retriever = vectorstore.as_retriever()

In [35]:
from langchain.chains import create_retrieval_chain

chain = create_retrieval_chain(retriever, doc_chain)

In [46]:
# Try the chain
response = chain.invoke("What causes cancer and how to avoid it?")

{'input': 'hi',
 'context': [Document(metadata={'source': '/kaggle/working/books/science-of-nutrition-2nd-edition.pdf', 'page': 9}, page_content='Hello. My name is Gustavo. Around 46 years ago, when I was 13, I came tothe United States from Mexico with my parents and three sisters to pick\ncrops in California, and now I manage a large vineyard. They ask me when I’m'),
  Document(metadata={'source': '/kaggle/working/books/science-of-nutrition-2nd-edition.pdf', 'page': 22}, page_content='A03_THOM3162_02_SE_FM.QXD  12/1/09  2:22 PM  Page xxii'),
  Document(metadata={'source': '/kaggle/working/books/science-of-nutrition-2nd-edition.pdf', 'page': 890}, page_content='Z09_THOM3162_02_SE_AKEY.QXD  11/30/09  5:04 PM  Page AN-16'),
  Document(metadata={'source': '/kaggle/working/books/science-of-nutrition-2nd-edition.pdf', 'page': 8}, page_content='Hi, I’m Theo. Let’s see, I’m 21, and my parents moved to the Midwest fromNigeria 11 years ago. The first time I ever played basketball, in middle\nsc