In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [None]:
def load_pdf_files(directory_path):
    """Load the PDF files found under ``directory_path`` via ``DirectoryLoader``.

    Args:
        directory_path: Folder searched recursively for ``*.pdf`` files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns, or an empty list when
        ``directory_path`` is not an existing directory or loading raises.
    """
    # isdir rather than exists: a path that points at a regular file must be
    # rejected here instead of crashing in the directory walk below.
    if not os.path.isdir(directory_path):
        print(f"Directory not found: {directory_path}")
        return []

    # Walk recursively so the report covers the same tree as the "**/*.pdf"
    # glob handed to the loader (a flat listdir would miss sub-folders).
    pdf_files = sorted(
        os.path.relpath(os.path.join(root, name), directory_path)
        for root, _dirs, names in os.walk(directory_path)
        for name in names
        if name.endswith('.pdf')
    )
    print(f"Found {len(pdf_files)} PDF files: {pdf_files}")

    # Load documents
    loader = DirectoryLoader(
        directory_path,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader  # Note: Changed from PyPDFDirectoryLoader
    )

    try:
        documents = loader.load()
        print(f"Successfully loaded {len(documents)} documents")
        return documents
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # list so the calling cell can test the result for truthiness.
        print(f"Error loading documents: {str(e)}")
        return []

In [None]:
# Test the function
# The data folder can be overridden with the MEDIBOTS_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location
# stays as the default.
data_path = os.environ.get(
    "MEDIBOTS_DATA_DIR",
    "/home/shawon/Desktop/SpendX/MediBots/research/data",
)
extract_data = load_pdf_files(directory_path=data_path)

# Print first document content if available
if extract_data:
    print("\nFirst document content preview:")
    print(extract_data[0].page_content[:200])

In [None]:
# Show only the first few entries; displaying the whole list would dump every
# loaded document into the notebook output.
extract_data[:3]

In [None]:
def text_split(extract_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extract_data: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter (default 500).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter (default 20).

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph break, line break, space, then any character.
        separators=["\n\n", "\n", " ", ""],
    )
    texts = text_splitter.split_documents(extract_data)
    print(f"Split into {len(texts)} chunks.")
    return texts

In [None]:
# Chunk the loaded documents; `texts` is what gets embedded and indexed later.
texts = text_split(extract_data)
# The count is of chunks produced by the splitter, not of source documents.
print(f"Number of text chunks: {len(texts)}")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_hugging_face_embeddings(
    model_name="sentence-transformers/bert-large-nli-mean-tokens",
):
    """Build a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook has always used, so existing
            zero-argument calls behave as before.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

In [None]:
# Build the embedding model once; later cells reuse `embedding` for queries
# and for populating the vector store.
embedding = download_hugging_face_embeddings()

In [None]:
# Embed one sample question and report the size of the resulting vector.
query = "What is the main purpose of the document?"
query_embedding = embedding.embed_query(query)
print("Query embedding vector length: " + str(len(query_embedding)))

In [None]:
pip install pinecone

In [None]:
from dotenv import load_dotenv
import os
# Read the .env file so the os.getenv(...) lookups in the following cells can
# find the API keys.
load_dotenv()


In [None]:
# Pinecone credential taken from the process environment (None when unset).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [None]:
# Confirm the key was loaded without writing the secret into the saved
# notebook output.
print(f"PINECONE_API_KEY loaded: {bool(PINECONE_API_KEY)}")

In [None]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# Report only whether the key is present; echoing the value would store the
# secret in the notebook output.
print(f"GEMINI_API_KEY loaded: {bool(GEMINI_API_KEY)}")

In [None]:
import os
from dotenv import load_dotenv

load_dotenv(override=True)  # reload .env, replacing any existing values
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Sanity check without exposing the key. Guard the unset case: slicing None
# would raise TypeError and abort the cell.
if OPENAI_API_KEY:
    print(OPENAI_API_KEY[-4:])
else:
    print("OPENAI_API_KEY is not set")



In [None]:
# Smoke test: embed a short string and print the dimensionality of the result.
vector = embedding.embed_query("Hello world")
print("Vector length: " + str(len(vector)))

In [None]:
# Re-export the keys for libraries that read them from the environment.
# os.environ only accepts strings, so a key that was never loaded (None) is
# reported and skipped instead of raising TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        print(f"{_key_name} is not set; skipping export")
    else:
        os.environ[_key_name] = _key_value

In [None]:
from pinecone import Pinecone
import pinecone

# Pinecone client used by the index-management cells below.
# The previous `environment="us-west1-gcp"` argument was dropped: it contradicts
# the aws/us-east-1 location the index is created in, and cloud/region are
# specified at index-creation time. The no-op self-assignment of
# PINECONE_API_KEY was removed as well.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [None]:
from pinecone import ServerlessSpec

index_name = "medibots-research"

if not pc.has_index(index_name):
    # The vectors stored in this index are produced locally by `embedding`
    # (the HuggingFace model), not by a Pinecone-hosted model. Create a plain
    # dense serverless index whose dimension is measured from that model,
    # instead of an integrated-embedding index bound to a different model.
    embedding_dimension = len(embedding.embed_query("dimension probe"))
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunks in `texts`, using `embedding` and
# targeting the Pinecone index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the same chunks again —
# confirm whether that creates duplicates before re-executing on a populated index.
docsearch = PineconeVectorStore.from_documents(
    documents=texts,
    embedding=embedding,
    index_name=index_name
)


In [None]:
# Similarity-search retriever over the vector store, returning 3 hits per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# `get_relevant_documents` is deprecated on LangChain retrievers; `invoke` is
# the supported entry point and takes the same query string.
retriever_docs = retriever.invoke("What is Acne?")
retriever_docs

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used as the answer generator; the API key is read from the
# environment at the moment this cell runs.
chatModel = ChatGoogleGenerativeAI(
    google_api_key=os.environ.get("GEMINI_API_KEY"),
    model="gemini-1.5-flash-latest",
)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate


In [None]:
# System message for the QA chain. The adjacent string literals are joined
# as-is, so the blank line before {context} must be spelled out; without it the
# context was glued directly onto the closing quote of the fallback sentence.
system_prompt = (
    "You are a helpful medical research assistant. Use the following context to answer the question accurately and concisely. "
    "If the context does not contain the answer, respond with 'The information is not available in the provided documents.'"
    "\n\n"
    "{context}"
)

# Chat template: the system instructions followed by the user's question,
# which is filled in through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [None]:
# First chain: combines documents with the prompt and sends them to the chat model.
question_answer_chain = create_stuff_documents_chain(
    llm=chatModel,
    prompt=prompt,
)
# Second chain: runs the retriever and feeds its results to the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Run the full chain on a sample question and print the "answer" entry of the
# returned mapping.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])
