## Step 1: Initialize an Embedding Model

In [30]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001"
)


In [4]:
#pip install langchain chromadb


## Step 2: Initialize the Chroma DB Connection

In [5]:
from langchain.vectorstores import Chroma

db = Chroma(
    collection_name="pharma_database",
    embedding_function=embedding_model,
    persist_directory='./pharma_db'
)


  warn_deprecated(


In [6]:
db.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

## Start 3: Load necessary documents

In [7]:
#pip install pypdf

In [8]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader

loader = DirectoryLoader(
    path="research-papers", glob="*.pdf", show_progress=True, loader_cls=PyPDFLoader
)

data = loader.load()

100%|██████████| 7/7 [00:06<00:00,  1.03it/s]


In [9]:
print("Type of Data Variable: ", type(data))
print()
print("Number of Documents: ", len(data))
print()
print("Type of each datapoints:", type(data[0]))
print()
print("Metadata: ", data[0].metadata)
print()
print("Page Content:")
print(data[0].page_content[:200])

Type of Data Variable:  <class 'list'>

Number of Documents:  63

Type of each datapoints: <class 'langchain_core.documents.base.Document'>

Metadata:  {'source': 'research-papers\\2060-AI-in-Life-Sciences.pdf', 'page': 0}

Page Content:
Executive Insights
Artificial Intelligence in Life Sciences: The Formula for Pharma Success Across the Drug Lifecycle was written by  
Clay Heskett, Partner, Ben Faircloth, Partner, and Stephen Roper,


In [10]:
data[38].metadata

{'source': 'research-papers\\AI-In-Pharmacy.pdf', 'page': 0}

## Step 4: Split the document into chunks

In [11]:
doc_metadata = [data[i].metadata for i in range(len(data))]
doc_content = [data[i].page_content for i in range(len(data))]

In [12]:
doc_metadata[0], doc_content[0][:100]

({'source': 'research-papers\\2060-AI-in-Life-Sciences.pdf', 'page': 0},
 'Executive Insights\nArtificial Intelligence in Life Sciences: The Formula for Pharma Success Across t')

In [13]:
#pip install tf-keras

In [14]:
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter

st_text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-mpnet-base-v2", 
                                                         chunk_size=100, 
                                                         chunk_overlap=50)

st_chunks = st_text_splitter.create_documents(doc_content, doc_metadata)

In [15]:
print("Total number of documents inside chunks:", len(st_chunks))
print()
for i, chunk in enumerate(st_chunks, start=1):
    print(f"Document {i} metadata: {chunk.metadata}")
    print(f"Document {i} chunks: {chunk.page_content[:100]}")
    if i == 5: break
    print("-" * 100)

Total number of documents inside chunks: 167

Document 1 metadata: {'source': 'research-papers\\2060-AI-in-Life-Sciences.pdf', 'page': 0}
Document 1 chunks: executive insights artificial intelligence in life sciences : the formula for pharma success across 
----------------------------------------------------------------------------------------------------
Document 2 metadata: {'source': 'research-papers\\2060-AI-in-Life-Sciences.pdf', 'page': 0}
Document 2 chunks: iteratively optimize their processes. within life sciences, we apply the term “ ai ” to four major a
----------------------------------------------------------------------------------------------------
Document 3 metadata: {'source': 'research-papers\\2060-AI-in-Life-Sciences.pdf', 'page': 1}
Document 3 chunks: executive insights page 2 l. e. k. consulting / executive insights, volume xx, issue 60 ai ’ s abili
----------------------------------------------------------------------------------------------------
Document 4 meta

## Step 5: Add Chunks to ChromaDB

In [16]:
db.add_documents(st_chunks)

['7187108c-47eb-47a2-9c5e-db6b53b68c4d',
 '317f157b-6fc1-485c-b172-ff1a7ca46504',
 '0a8d0c7b-d9d0-4bbf-9d1e-f22244fc6490',
 'a6c2f180-a994-4d1e-adee-740aa33f94f4',
 'ec98e0d6-9b88-4df0-ac13-eecea406b097',
 '15973eda-45df-4f33-87fc-e5019c355195',
 'd1f36330-98c9-4dba-ac55-5347517a8780',
 '265ab9e8-5596-4d42-8155-5e15981847e0',
 '9eff62f7-d490-4808-8e0f-a051de9e9bb3',
 'ebe167e8-a9e9-4ca3-8a5f-5ec84429e05b',
 '262bc811-d67f-46d3-8c66-abe255c31549',
 '9132d5e8-f501-49b6-a912-8e8d12d82269',
 'f2e2583e-f007-49d0-99cd-97a95647e439',
 '6132ae49-3f6c-480b-9426-89993f3e938b',
 '2a418cba-cd06-419b-9e65-ed4dbe80e770',
 '047204e8-fb9b-448f-ab56-04fc87a5e8b9',
 '7b518628-4750-41e6-9191-bf9fc581aa62',
 'b5ffe79c-5132-4fee-8072-2de253aeaff9',
 'cf7b0182-c619-4126-87e0-171edfae0b72',
 'e5f3f687-6e4a-445e-9255-5dcf7144b90d',
 '41e4f81c-796e-448c-91e2-a563c573ae62',
 'cc611801-6c45-49f1-b88b-53417b238a79',
 '6bb9d805-d9c5-496d-8148-818025af81b9',
 '5c132b3d-ffe5-4db3-a282-24ceb1f3f78f',
 'ab09b7af-26a9-

In [17]:
db.get()['documents'][1][:100]

'is another aspect of ai and it involves designing and building machines with the ability to use sens'

In [18]:
db.get()['metadatas'][:5]

[{'page': 15,
  'source': 'research-papers\\Advancements-and-Applications-of-AI.pdf'},
 {'page': 1, 'source': 'research-papers\\4839-AI-In-Pharmacy-Article.pdf'},
 {'page': 15,
  'source': 'research-papers\\Advancements-and-Applications-of-AI.pdf'},
 {'page': 7, 'source': 'research-papers\\AI-In-Pharmacy.pdf'},
 {'page': 14,
  'source': 'research-papers\\Advancements-and-Applications-of-AI.pdf'}]

## Step 6: Create a Retriever Object and apply Similarity Search

In [19]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 5})

## Step 7: Initialize a Chat Prompt Template

In [20]:
from langchain_core.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
You are a highly knowledgeable assistant specializing in pharmaceutical sciences. 
Answer the question based only on the following context:
{context}

Answer the question based on the above context:
{question}

Use the provided context to answer the user's question accurately and concisely.
Don't justify your answers.
Don't give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

## Step 8: Initialize a Generator (i.e. Chat Model)

In [23]:
from dotenv import load_dotenv
load_dotenv()

True

In [24]:
from langchain_google_genai import ChatGoogleGenerativeAI

chat_model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=1
)

## Step 9: Initialize a Output Parser

In [25]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

## Step 10: Define a RAG Chain

In [26]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt_template | chat_model | output_parser

## Step 11: Invoke the Chain

In [27]:
query = "What is Pharmaceutical industry?"

rag_chain.invoke(query)

'The pharmaceutical industry is encountering constraints due to the limited scope of modern advances, resulting in a complicated and time-consuming drug development process with high risks and costs.  It relies on traditional procedures and human intuition, leading to errors, inefficiencies, and delays.  The current system struggles to personalize treatments, contributing to healthcare costs, patient care problems, and pharmaceutical shortages, resulting in poor patient outcomes, including fatalities.  It also faces long wait times for compensation for innovative medications.  However, the industry is adopting AI to improve processes, from drug discovery and development to clinical trials and manufacturing, aiming for increased efficiency, innovation, and better patient outcomes.\n'

In [28]:
from IPython.display import display
from IPython.display import Markdown
import textwrap

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [29]:
query = "What are the different AI applications used in pharmaceutical industry?"

response = rag_chain.invoke(query)

to_markdown(response)

> AI is used in drug discovery, optimizing clinical processes (recruitment and patient monitoring), post-development patient monitoring, compliance monitoring, marketing optimization, chatbots for patient questions, drug design and manufacturing, automation in pharmaceutical manufacturing (packaging and labeling), creating and refining nano-scale drug delivery systems, predicting properties of drug formulations, and automating business processes.
