In [None]:
# Smoke test: confirms the kernel is up and executing cells.
print("ok")

In [None]:
# Show the current working directory before it is changed in the next cell.
%pwd

In [None]:
import os

# Step up one level so that the relative "Data/" path used later resolves.
# The move is guarded: an unconditional chdir("../") climbs one more directory
# every time this cell is re-run, silently breaking every relative path below.
# Assumes the "Data" folder lives in the directory we want to end up in.
if not os.path.isdir("Data"):
    os.chdir("../")

In [None]:
# Show the working directory again to confirm the directory change above.
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract data from PDF
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files matching ``*.pdf`` are
            read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()

In [None]:
# Read every PDF under Data/ (relative to the current working directory).
extracted_data = load_pdf_file("Data/")

In [None]:
# Uncomment to inspect the loaded documents:
# extracted_data

In [None]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, e.g. the output of
            ``load_pdf_file``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            20, the value previously hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print(f"Number of text chunks: {len(text_chunks)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Download the embeddings from HuggingFace
def download_embeddings():
    """Build the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [None]:
# Instantiate the embedding model once; it is reused for the sanity check,
# the upsert and the retriever below.
embeddings = download_embeddings()

In [None]:
# Sanity check: embed a sample string and print the length of the returned
# vector. This length must agree with the `dimension` passed to
# `pc.create_index` further down (384 there).
query_result = embeddings.embed_query("Hello")
print(f"Query result length: {len(query_result)}")

In [None]:
from dotenv import load_dotenv
# Load environment variables from a .env file so that settings such as
# PINECONE_API_KEY (read in the next cell) are available via os.environ.
load_dotenv()

In [None]:
# Read the Pinecone API key from the environment (the .env file loaded above
# or the shell). Fail fast when it is missing: otherwise the value is None and
# the failure only shows up later as a less obvious error, e.g. a TypeError
# when None is assigned back into os.environ.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "genai-medical-chatbot"

# Only create the index when it does not exist yet. An unconditional
# create_index call fails once the index is there, which made this cell break
# on every run after the first (including Restart & Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors returned by
        # `embeddings.embed_query`; replace with your model's dimension.
        dimension=384,
        metric="cosine",  # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
import os
# Write the API key into the process environment under PINECONE_API_KEY.
# NOTE(review): PINECONE_API_KEY was itself read from os.environ earlier, so
# this looks like a no-op write-back; presumably it is kept so that
# PineconeVectorStore (called below without an explicit key) can find the key
# in the environment -- confirm. Raises TypeError if PINECONE_API_KEY is None.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): re-running this cell appears to upsert the same chunks again;
# confirm whether that duplicates vectors in the namespace.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    namespace="genai-medical-chatbot",
    index_name=index_name,
)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Bind `docsearch` to the existing index and namespace, using the same
# embedding model for queries. This rebinds the `docsearch` created by the
# upsert cell above.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    namespace="genai-medical-chatbot",
    index_name=index_name,
)

In [None]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Try the retriever on its own with a sample question, before wiring it into
# the full chain.
retriever_docs = retriever.invoke("What is acne?")

In [None]:
# Display what the retriever returned for the sample question.
retriever_docs

In [None]:
from langchain_ollama import OllamaLLM

# Ollama-served model used to generate the answers.
llm = OllamaLLM(
    model="gemma:2b",
    num_predict=500,  # equivalent to max_tokens
    temperature=0.4,
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import  create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. Adjacent string literals are
# concatenated with nothing in between, so every sentence except the last one
# carries its own trailing space; without it the prompt reads
# "...tasks.Use the following...". The retrieved context is substituted for
# the {context} placeholder after a blank line.
system_prompt = (
    "You are an assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Do not try to make up an answer. "
    "Use 3 sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions (which contain the
# {context} placeholder) followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [None]:
# Chain that combines the documents it is given with the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Retrieval chain: fetch documents with `retriever`, then hand them to the
# question-answering chain built above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask the sample question through the whole chain and print only the
# generated answer from the returned mapping.
response = rag_chain.invoke({"input": "What is acne?"})
print(response["answer"])

In [None]:
# Second question through the chain; presumably one the indexed documents
# cannot answer, to check that the model says it does not know -- confirm.
response = rag_chain.invoke({"input": "Do you know Niloy?"})
print(response["answer"])