# RAG (Retrieval Augmented Generation) with PHI-3.5

This notebook is an implementation of the RAG model with PHI-3.5.

In [1]:
!pip install langchain langchain_groq langchain_community langchain_huggingface faiss-cpu sentence-transformers pypdf -q

In [None]:
# Create the folder that holds the source PDFs.
# os.makedirs(..., exist_ok=True) is a no-op when the folder already exists, so
# this cell can be re-run safely and does not depend on a shell `mkdir` command.
# `os` is imported here because this cell runs before the main import cell.
import os

os.makedirs("books", exist_ok=True)

In [1]:
import getpass
import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_groq import ChatGroq

In [2]:
# Load and preprocess the PDF documents
def load_pdfs(file_paths):
    """Load every PDF in ``file_paths`` and return the results as one flat list.

    Args:
        file_paths: Iterable of paths to PDF files.

    Returns:
        A single list holding, in input order, every item returned by
        ``PyPDFLoader(path).load()`` for each path. An empty iterable
        yields an empty list.
    """
    return [
        page
        for pdf_path in file_paths
        for page in PyPDFLoader(pdf_path).load()
    ]

# PDFs to ingest; paths are relative to the notebook's working directory.
pdf_files = [
    "books/Cancer-Principles.pdf",
]
documents = load_pdfs(pdf_files)

# Sanity check: show the first 100 characters of the string form of the
# 101st loaded entry (index 100).
sample_page = documents[100]
str(sample_page)[:100]

"page_content='synthesis process.\nM-PHASE ENTRY AND EXIT\nOnce the cell has copied the entire genome, "

In [3]:
# Break the loaded documents into overlapping chunks for embedding:
# at most 500 units per chunk (measured with len), 100 units of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
)
chunks = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(chunks)}")

Number of chunks: 50093


In [5]:
# Embed the chunks and keep them in a FAISS vectorstore, cached on disk so the
# (slow) embedding step only runs the first time.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

VECTORSTORE_DIR = "vectorstore"

if os.path.exists(VECTORSTORE_DIR):
    # SECURITY: allow_dangerous_deserialization=True is required because the
    # saved index is unpickled on load, which can execute arbitrary code.
    # Only load a vectorstore directory that this notebook created itself.
    # NOTE: the cache is reused as-is; delete the directory after changing the
    # source PDFs, the chunking parameters, or the embedding model.
    vectorstore = FAISS.load_local(
        VECTORSTORE_DIR,
        embedder,
        allow_dangerous_deserialization=True,
    )
else:
    # No cache yet: embed every chunk, then persist the index for later runs.
    vectorstore = FAISS.from_documents(chunks, embedder)
    vectorstore.save_local(VECTORSTORE_DIR)

In [None]:
# Let's run a sample query against the vectorstore.
query = "What is cancer?"
k = 5  # how many of the closest chunks to retrieve
similar_docs = vectorstore.similarity_search(query=query, k=k)

In [24]:
# SECURITY: never hardcode the Groq API key in the notebook. Any key that has
# ever been saved in this file must be treated as leaked and revoked.
# The key is taken from the GROQ_API_KEY environment variable; if it is not
# set, prompt for it without echoing the input.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

# Chat model used to generate answers from the retrieved context.
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.7)

Document(metadata={'source': 'books/Cancer-Principles.pdf', 'page': 2373}, page_content='BCC-80.  Multiple basal cell carcinomas in a patient who received radiation therapy for acne 50 years earlier. Any new therapeutic moda lity that is practiced on the \nskin today may yield unanticipated complications decades later. In most cases, it is difficult to anticipate what those effects  might be, and the need for effective current \nmanagement typically outweighs unknown delayed risks.')

In [None]:
# Create a retrieval chain: fetch the chunks most similar to the question from
# the vectorstore and pass them to the LLM as context.
# chain_type="stuff" inserts all retrieved chunks into a single prompt.
# NOTE(review): the original call was left unfinished (`chain_type=` had no
# value); confirm that "stuff" and k=5 match the intended configuration.
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
)