In [None]:
!pip install langchain_chroma langchain_groq langchain_core langchain_community langchain_text_splitters pypdf gradio sentence-transformers

In [None]:
import hashlib
import os
from getpass import getpass

from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter

In [None]:
# Make sure the folder that holds the on-disk vector store exists;
# exist_ok keeps this cell safe to re-run.
os.makedirs(name="pharma_db", exist_ok=True)

In [None]:
# Sentence-transformers model that produces the vectors stored in the collection
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chroma collection "pharma_database", persisted under the pharma_db directory
db = Chroma(
    persist_directory="pharma_db",
    embedding_function=embedding_model,
    collection_name="pharma_database",
)

In [None]:
# Turns the chat model's reply into a plain string
output_parser = StrOutputParser()

# Instruction text for the LLM; the {context} and {question} placeholders
# are filled in by the RAG chain for every query.
PROMPT_TEMPLATE = """
You are a highly knowledgeable assistant specializing in pharmaceutical sciences.
Answer the question based only on the following context:
{context}

Answer the question based on the above context:
{question}

Use the provided context to answer the user's question accurately and concisely.
Don't justify your answers.
Don't give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [None]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` of every document, in order, with one blank
        line between consecutive entries (an empty string for no documents).
    """
    page_texts = [entry.page_content for entry in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

In [None]:
def process_documents(file_paths):
    """Load PDF files, split them into token-based chunks and index them in ``db``.

    Args:
        file_paths: Iterable of PDF locations. Each item may be a ``str`` or
            ``os.PathLike`` path, or an upload object that exposes its path
            as ``.name`` (presumably what some Gradio versions hand to
            callbacks; verify against the installed release). ``None`` or an
            empty collection is accepted and indexes nothing.

    Returns:
        A status message suitable for display to the user.
    """
    status_message = "✅ Documents processed and added to database."

    if not file_paths:
        # Nothing to ingest, so avoid loading the splitter's model at all.
        return status_message

    # The splitter loads a sentence-transformers model on construction, so it is
    # built once here instead of once per file.
    text_splitter = SentenceTransformersTokenTextSplitter(
        model_name="sentence-transformers/all-mpnet-base-v2",
        chunk_size=100,
        chunk_overlap=50,
    )

    for file_path in file_paths:
        if isinstance(file_path, (str, os.PathLike)):
            pdf_path = os.fspath(file_path)
        else:
            pdf_path = file_path.name

        pages = PyPDFLoader(pdf_path).load()
        chunks = text_splitter.create_documents(
            [page.page_content for page in pages],
            [page.metadata for page in pages],
        )
        if not chunks:
            # No extractable text (e.g. an image-only PDF): nothing to store.
            continue

        # Deterministic ids (file name + chunk position + chunk text) let the
        # store overwrite chunks when the same file is ingested again, instead
        # of accumulating duplicates that would crowd the top-k results.
        file_label = os.path.basename(pdf_path)
        chunk_ids = [
            hashlib.sha256(
                "\x00".join((file_label, str(position), chunk.page_content)).encode(
                    "utf-8", errors="replace"
                )
            ).hexdigest()
            for position, chunk in enumerate(chunks)
        ]
        db.add_documents(chunks, ids=chunk_ids)

    return status_message

In [None]:
def run_query(query, groq_api_key, model="llama-3.3-70b-versatile", temperature=0.0, k=5):
    """Answer ``query`` with retrieval-augmented generation over ``db``.

    The ``k`` most similar chunks are retrieved from the vector store, joined
    by ``format_docs`` into the prompt's context, and sent to a Groq-hosted
    chat model.

    Args:
        query: The user's question. Must not be blank.
        groq_api_key: API key forwarded to ``ChatGroq``.
        model: Groq model name.
        temperature: Sampling temperature. Defaults to 0 to keep sampling
            randomness to a minimum, since the prompt asks for answers
            restricted to the retrieved context.
        k: Number of chunks to retrieve for the context.

    Returns:
        The model's answer as a string.

    Raises:
        ValueError: If ``query`` is empty or contains only whitespace.
    """
    if not query or not str(query).strip():
        # Avoid an embedding lookup and a paid LLM call for an empty question.
        raise ValueError("query must be a non-empty string")

    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})

    llm = ChatGroq(
        model=model,
        api_key=groq_api_key,
        temperature=temperature,
    )

    # The bare query string fans out to both branches of the dict: the
    # retriever turns it into context, the passthrough keeps it as the question.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | llm
        | output_parser
    )

    return rag_chain.invoke(query)

In [1]:
# Example usage: ingest two sample PDFs, then ask one question about them
if __name__ == "__main__":
    pdf_paths = [
        "/content/sample_data/AI_in_Drug_Discovery.pdf",
        "/content/sample_data/Vaccine_Development_Workflow.pdf",
    ]
    # Check up front so a missing file is reported by name before any
    # document has been partially ingested.
    missing_paths = [path for path in pdf_paths if not os.path.isfile(path)]
    if missing_paths:
        raise FileNotFoundError(f"Sample PDFs not found: {missing_paths}")
    process_documents(pdf_paths)

    # Keep the key out of the notebook: read it from the environment, or
    # prompt for it without echoing.
    groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
    user_query = "What are the AI applications in drug discovery?"
    answer = run_query(user_query, groq_api_key)
    print("Answer:", answer)


Answer: Artificial intelligence (AI) applications in drug discovery include: 
1. Identifying promising drug candidates by analyzing large chemical databases using machine learning.
2. Predicting molecular interactions and drug-target affinities using deep learning models.
3. Patient stratification in clinical trials.
4. Drug repurposing efforts. 


In [None]:
import gradio as gr

# Note: The imports and core functions (format_docs, process_documents, run_query) are assumed to be defined in previous cells.

def pharma_query_interface(query, groq_api_key, files):
    """Gradio callback: ingest any uploaded PDFs, then answer the question.

    Args:
        query: Question typed by the user.
        groq_api_key: Groq API key typed by the user.
        files: Uploaded PDFs as supplied by the ``gr.File`` input, or
            ``None``/empty when nothing was uploaded.

    Returns:
        The generated answer, or a warning message when the query or the API
        key is missing or blank.
    """
    if files:
        # Ingestion runs before validation, so a submission that only uploads
        # files still gets them indexed.
        process_documents(files)

    # Strip whitespace so blank-looking input is rejected and a key pasted
    # with a trailing space or newline still works.
    query = (query or "").strip()
    groq_api_key = (groq_api_key or "").strip()
    if not query or not groq_api_key:
        return "⚠️ Please enter a query and your GROQ API key."

    return run_query(query, groq_api_key)

# Gradio UI: question and API-key text boxes plus an optional multi-PDF upload,
# wired to pharma_query_interface; the answer is shown in a 10-line text box.
iface = gr.Interface(
    fn=pharma_query_interface,
    inputs=[
        gr.Textbox(label="Pharmaceutical Question", placeholder="e.g., What are the AI applications in drug discovery?"),
        gr.Textbox(label="Groq API Key", type="password"),
        gr.File(label="Upload PDF documents (optional)", file_types=[".pdf"], file_count="multiple")
    ],
    outputs=gr.Textbox(label="RAG Answer", lines=10),
    title="💊 PharmaAssist - RAG for Pharmaceutical Research",
    description="Upload pharmaceutical research PDFs and ask questions using Groq's LLaMA3 and HuggingFace Embeddings."
)

# Launch the Gradio app
if __name__ == "__main__":
    # Note: Gradio interfaces often require a restart of the kernel in notebook environments to stop cleanly.
    iface.launch()