In [7]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain.embeddings import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_community.vectorstores import Pinecone as Pine
import os

In [8]:
# Credentials are read from the environment so no secret is stored in the notebook.
# os.environ only accepts str values, so writing back a missing key (None) would
# raise TypeError; re-export the OpenAI key only when it is actually set.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if _openai_api_key is not None:
    os.environ["OPENAI_API_KEY"] = _openai_api_key

# None when the variable is not set in the environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = "gcp-starter"  # NOTE(review): not referenced by the cells visible here
index_name = "medical-chatbot"  # name of the Pinecone index that stores the document vectors

In [15]:
# 2. Load PDF documents
# The directory is given relative to the notebook's working directory.
pdf_dir = "./data"
loader = PyPDFDirectoryLoader(pdf_dir)
# `documents` is the input to the text splitter in the next step.
documents = loader.load()

In [16]:
# 3. Cut the loaded documents into overlapping chunks.
# Sizes are tunable: 500 per chunk with an overlap of 20 between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = splitter.split_documents(documents)

In [17]:
# Keep only the raw text of every chunk; these strings are what gets embedded
# and stored as Pinecone metadata further down.
texts = [
    text_chunk.page_content
    for text_chunk in text_chunks
]

In [10]:
def download_hf_embedding(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the Hugging Face embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the MiniLM sentence-transformer the notebook was
            written against. The Pinecone index in this notebook is created
            with ``dimension=384``, so a replacement model must produce
            vectors of that length.

    Returns:
        The ``HuggingFaceEmbeddings`` instance constructed for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [11]:
# Instantiate the embedding model once; the later cells reuse this object for
# the sanity check, for embedding the chunks, and for the vector store wrapper.
embeddings = download_hf_embedding()

  embeddings = HuggingFaceEmbeddings(


In [None]:
# Sanity check: embed one sample sentence and report the vector length.
# embed_query() requires the text to embed; calling it with no argument raises
# TypeError. The printed length must equal the `dimension` used when the
# Pinecone index is created (384 in this notebook).
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [21]:
# Pinecone client authenticated with the API key loaded earlier.
pc = Pinecone(api_key=PINECONE_API_KEY)

# First run only: create a serverless index. The dimension has to equal the
# length of the embedding vectors (the sanity-check cell reports 384).
if not any(existing.name == index_name for existing in pc.list_indexes()):
    pc.create_index(
        index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle used for the upserts and for the vector store wrapper below.
index = pc.Index(index_name)

# 6. Embed every chunk, then pair each vector with a stable id and its source
# text (stored under the "text" metadata key).
embedding_vectors = embeddings.embed_documents(texts)

vectors = [
    (f"doc-{position}", vector, {"text": text})
    for position, (text, vector) in enumerate(zip(texts, embedding_vectors))
]

In [23]:
# Upload the vectors in slices rather than in one request. Start small; the
# batch can grow as long as a request stays under 4MB.
batch_size = 100
for start in range(0, len(vectors), batch_size):
    index.upsert(vectors=vectors[start:start + batch_size])

print("✅ PDF processing and upsert complete.")

✅ PDF processing and upsert complete.


In [37]:
# LangChain vector store over the Pinecone index. Arguments, in order:
#   1. the pinecone Index object,
#   2. the embedding model instance,
#   3. the metadata field that holds each chunk's text.
docsearch = Pine(index, embeddings, "text")

In [None]:
# Retrieval smoke test: fetch the five closest chunks for a sample question
# (the query has to be a plain string) and print their text.
query = "Can you explain lupus"
docs = docsearch.similarity_search(query, k=5)

for match in docs:
    print(match.page_content)

Antipruritic drugs see Anti-itch drugs
Antipsychotic drugs
Definition
Antipsychotic drugs are a class of medicines used to
treat psychosis and other mental and emotional condi-
tions.
Purpose
Psychosis is defined as “a serious mental disorder
(as schizophrenia ) characterized by defective or lost
contact with reality often with hallucinations or delu-
sions.” Psychosis is an end-stage condition arising from a
variety of possible causes. Anti-psychotic drugs control
than in adults, which may lead to a misdiagnosis of schiz-
ophrenia. Children and adolescents also tend toward irri-
tability and aggressiveness instead of elation. Further,
symptoms tend to be chronic, or ongoing, rather than
acute, or episodic. Bipolar children are easily distracted,
impulsive, and hyperactive, which can lead to a misdiag-
nosis of attention deficit hyperactivity disorder (ADHD).
Furthermore, their aggression often leads to violence,
caused by constriction of the blood vessels in the
extremities, and occur

In [44]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [45]:
# Wrap the raw template and declare the two variables it expects.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Handed to RetrievalQA.from_chain_type as `chain_type_kwargs`.
chain_type_kwargs = dict(prompt=PROMPT)

In [46]:
# Llama model loaded through CTransformers from the weights file under model/.
# Generation is capped at 512 new tokens with a sampling temperature of 0.8.
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8},
)

In [None]:
# Retrieval-augmented QA chain: the retriever supplies the top 2 chunks, which
# the "stuff" chain type places into the custom prompt. The result also
# carries the source documents.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Interactive question loop.
# Type "exit" or "quit" (or interrupt the kernel / close stdin) to leave it;
# without an exit path the cell would block forever and the notebook could
# never finish a "Run All".
while True:
    try:
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input available, or the user interrupted the prompt.
        break
    if user_input.lower() in {"exit", "quit"}:
        break
    if not user_input:
        # Nothing to ask; do not spend an LLM call on a blank query.
        continue
    result = qa({"query": user_input})
    print("Response : ", result["result"])