<a href="https://colab.research.google.com/github/mellobo05/AI-LLM/blob/main/RAG_Prostate_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
#Make sure you have installed dotenv, langchain_community, langchain_google_genai, pypdf
from google.colab import drive
drive.mount('/content/drive')

pdf_path = '/content/drive/My Drive/Colab Notebooks/chandraprakash-merged.pdf'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.prompts import PromptTemplate
from langchain import hub
from datetime import datetime
from google.colab import userdata
import re


# Load environment variables (although using userdata is preferred in Colab)
load_dotenv()

# Correctly retrieve the API key from Colab Secrets
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
# Ensure the API key is set as an environment variable for langchain to pick it up
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# ====== LLM & PROMPT ======
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
qa_prompt = hub.pull('langchain-ai/retrieval-qa-chat')
#Create vector embedding
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

#Load the pdf document into document object (by default it will create one document per page)
#Split the document into smaller chunks for better processing
#Create embedding for the document chunks
#Create a in memory vector store to hold the document chunks and their embedding
#Perform a similarity search in the vector store to retrieve relavent documents
#Invoke the chain with the context and user query

#pdf_loader = PyPDFLoader(pdf_path)
#documents = pdf_loader.load()
#print(documents)

# ====== TEXT CLEANING FUNCTION ======
def clean_text(text):
    # Remove excessive newlines
    text = text.replace("\n", " ").strip()
    # Remove random all-caps gibberish (base64-like)
    text = re.sub(r'\b[A-Z0-9]{8,}\b', '', text)
    # Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text)
    return text

# ====== LOAD & CLEAN PDF ======
pdf_loader = PyPDFLoader(pdf_path)
documents = pdf_loader.load()

# Clean and filter blank pages
cleaned_docs = []
for doc in documents:
    txt = clean_text(doc.page_content)
    if len(txt) > 30:  # skip empty or useless pages
        doc.page_content = txt
        cleaned_docs.append(doc)

# ====== SPLIT INTO CHUNKS ======
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# ====== CREATE VECTOR STORE ======
vector_store = InMemoryVectorStore(embedding=embedding_model)
vector_store.add_documents(split_documents)

# ===== Custom Prompt =====
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the following extracted text from a medical report to answer the question.
If the answer is not explicitly in the text, say "Not found in the document."

Context:
{context}

Question:
{question}

Answer:
"""
)

# ===== Function: Extract PSA with Dates =====
def extract_latest_psa(chunks):
    psa_records = []
    date_pattern = r"\b(\d{1,2}/\d{1,2}/\d{4})\b"
    psa_pattern = r"PSA.*?ng/mL\s*([\d.]+)"  # Fixed regex

    for doc in chunks:
        text = doc.page_content
        date_match = re.search(date_pattern, text)
        psa_match = re.search(psa_pattern, text, re.IGNORECASE)
        if psa_match:
            date_str = date_match.group(1) if date_match else None
            psa_value = psa_match.group(1)
            psa_records.append((date_str, psa_value, text))

    # Sort by date if present
    def parse_date(d):
        try:
            return datetime.strptime(d, "%d/%m/%Y")
        except:
            return datetime.min

    if psa_records:
        psa_records.sort(key=lambda x: parse_date(x[0]), reverse=True)
        latest = psa_records[0]
        return f"Latest PSA result: {latest[1]} ng/mL on {latest[0]}" if latest[0] else f"Latest PSA result: {latest[1]} ng/mL"
    return None

# ===== Keyword-first Retrieval =====
def find_relevant_chunks(user_query):
    keyword = "psa"
    psa_chunks = [doc for doc in split_documents if keyword in doc.page_content.lower()]
    if psa_chunks:
        return psa_chunks
    return vector_store.similarity_search(user_query, k=4)

# ===== Main Loop =====
print("\n✅ PDF loaded, cleaned, and indexed. You can now ask questions.")
print("Example: 'Latest PSA result for Chandraprakash'\n")

while True:
    user_query = input("What would you like to know? ").strip()
    if user_query.lower() in ["exit", "quit"]:
        break

    user_query = re.sub(r"chandraprash", "chandraprakash", user_query, flags=re.I)

    docs = find_relevant_chunks(user_query)

    # Special handling if query asks for latest PSA
    if "latest" in user_query.lower() and "psa" in user_query.lower():
        latest = extract_latest_psa(docs)
        if latest:
            print("\n--- Answer ---\n", latest, "\n")
            continue

    context = "\n\n".join([doc.page_content for doc in docs])
    final_prompt = qa_prompt.format(context=context, question=user_query)
    response = llm.invoke(final_prompt)
    print("\n--- Answer ---\n", response.content, "\n")




✅ PDF loaded, cleaned, and indexed. You can now ask questions.
Example: 'Latest PSA result for Chandraprakash'

What would you like to know? Latest PSA result

--- Answer ---
 Latest PSA result: 11.440 ng/mL on 11/7/2025 



KeyboardInterrupt: Interrupted by user

In [5]:
pip install dotenv



In [11]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-5.9.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.9.0
