In [2]:
import fitz  # PyMuPDF
import os

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(folder_path):
    """Extract the plain text of every PDF file directly inside ``folder_path``.

    Files are matched by a case-insensitive ``.pdf`` extension and processed in
    sorted filename order, so the result is reproducible across runs
    (``os.listdir`` makes no ordering guarantee).

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of strings, one per PDF, each the concatenation of
        ``page.get_text()`` over that document's pages. Empty if the folder
        holds no PDF files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    all_text = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Accept ".PDF"/".Pdf" as well, and skip directories whose name merely
        # ends in ".pdf".
        if not filename.lower().endswith(".pdf") or not os.path.isfile(file_path):
            continue
        with fitz.open(file_path) as doc:
            # join() builds the document text in one pass instead of repeated +=.
            all_text.append("".join(page.get_text() for page in doc))
    return all_text

# Folder containing the source PDFs. Set the PDF_FOLDER_PATH environment variable
# to run the notebook on another machine; the original location stays the default.
pdf_folder_path = os.environ.get("PDF_FOLDER_PATH", r"C:\Users\Asus\Desktop\Chatbot")

# One extracted text string per PDF found in the folder.
pdf_texts = extract_text_from_pdfs(pdf_folder_path)


In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive character splitter configured with chunk_size=200 and
# chunk_overlap=100, so neighbouring chunks share text across their boundary.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=100)

# Split every PDF's text and flatten the per-document chunk lists into a single
# list, keeping the documents in their original order.
all_chunks = [
    chunk
    for text in pdf_texts
    for chunk in splitter.split_text(text)
]


In [17]:
# Report how many chunks exist and show only a bounded sample; printing the whole
# list floods the cell output with the full text of every PDF.
preview_count = 5
print(
    f"Documents passed to the model: {len(all_chunks)} chunks "
    f"(showing up to {preview_count}): {all_chunks[:preview_count]}"
)




In [None]:
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS

# Fail early with an actionable message: with no chunks there is nothing to
# embed, and the index build below cannot produce a usable store.
if not all_chunks:
    raise ValueError(
        "No text chunks to index; check that the PDF folder contains PDFs with extractable text."
    )

# Initialize OllamaEmbeddings (Make sure to use an up-to-date model)
# NOTE(review): "llama3.2" is the same model used for chat further down; a
# dedicated embedding model may give better retrieval — confirm before changing.
embedding_model = OllamaEmbeddings(model="llama3.2")

# Generate the FAISS vector store from the chunks
vectorstore = FAISS.from_texts(all_chunks, embedding=embedding_model)

# Save the vector store locally for later use
vectorstore.save_local("marbet_vectorstore")


In [11]:
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

from langchain.prompts import ChatPromptTemplate

# Chat model served by the local Ollama instance.
llm = ChatOllama(
    base_url="http://localhost:11434",  # This is the correct base URL for your Ollama model
    model="llama3.2"
)

# Use the FAISS vector store as the retriever
retriever = vectorstore.as_retriever()

# Context-only part of the chatbot prompt: the instructions, the list of source
# documents, and the slot where the retrieved chunks are inserted.
template = """
You are a helpful assistant. Answer the following question based on the context provided. 

Here is some context about the documents:
1. Activities and excursions
2. Packing list
3. Scenic Eclipse A-Z Guide
4. Tutorials and additional documents
5. Travel Itinerary

{context}

Now, answer the user's question:
"""

# `prompt` deliberately stays context-only: the interactive cell formats it with
# just `context` for its debug preview.
prompt = ChatPromptTemplate.from_template(template)

# The "stuff" chain behind RetrievalQA only forwards the inputs that the prompt
# declares as variables. Without a {question} placeholder the user's question is
# dropped and the model receives the context alone, so the chain gets a prompt
# that ends with the question.
qa_prompt = ChatPromptTemplate.from_template(template + "{question}\n")

# Set up the RetrievalQA chain; source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True
)


In [12]:
# Interactive loop: read a question, show the retrieval debug output, then print
# the chain's answer. Typing 'exit' (any case, surrounding spaces ignored) stops it.
while True:
    # Ask the user for input
    question = input("I'm here to help you! What is your question? (type 'exit' to quit): ").strip()

    # Exit condition
    if question.lower() == 'exit':
        break

    # Nothing was typed; ask again instead of querying with an empty string.
    if not question:
        continue

    # The retriever takes the query string itself. Passing a dict here makes the
    # similarity search run on the dict rather than on the question text.
    retrieved_docs = retriever.invoke(question)
    print(f"Retrieved documents: {retrieved_docs}")  # Debug: print retrieved documents

    # Debug preview only: fills the context template with the retrieved chunks.
    # The chain below does its own retrieval and prompt formatting.
    prompt_to_model = prompt.format(context=" ".join(doc.page_content for doc in retrieved_docs))
    print(f"Prompt being sent to the model: {prompt_to_model}")  # Debug: print the generated prompt

    # The chain is called with only the "query" key; it retrieves documents
    # through the retriever it was built with.
    result = qa.invoke({"query": question})

    # Print the result (make sure to check the model output)
    print(f"Raw output: {result}")  # Debug: print raw model output

    # Print only the result part to show the final answer
    print(f"Answer: {result['result']}")


Retrieved documents: [Document(id='1e9bfafe-c170-4090-9ba4-22749bb1df1a', metadata={}, page_content='Code input\nYou will receive a 4-digit code by e-mail, which you enter in the following window and confirm with [ENTER CODE].\nEnter personal data\nPlease state whether you are known under any other names (e.g. stage names) and whether you have ever held a passport or identity \ncard from another country.\nSocial media\nOnly optional to fill in.\nGE/NEXUS/SENTRI membership\nPlease select [NO].\nParents\nPlease enter the names of your parents.\nInformation about your professional career\nPlease indicate whether you currently have an employer and enter the information.\nTravel information\nPlease click [NO] when asked if you are in transit.\nIndicate your location in the USA. Please enter the data of the ship:\n1) Name: M.V. Scenic Eclipse\n2) Address: 4000 Hollywood Blvd - Apartment No. Suite 625\n3) City: Hollywood\n4) State: Florida\n5) Country code: USA (+1)\n6) Telephone number: 857-