In [5]:
import fitz  # PyMuPDF
import os

# Function to extract text from all PDFs in a folder
def extract_text_from_pdfs(folder_path):
    all_text = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):  # Check if the file is a PDF
            with fitz.open(os.path.join(folder_path, filename)) as doc:
                pdf_text = ""
                for page in doc:
                    pdf_text += page.get_text()  # Extract text from each page
                all_text.append(pdf_text)  # Append extracted text from each PDF
    return all_text

# Specify folder path containing your PDFs
pdf_folder_path = r"C:\Users\Asus\Desktop\Chatbot"
pdf_texts = extract_text_from_pdfs(pdf_folder_path)


In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize text splitter with larger chunk size and overlap for better context capture
splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=100)

all_chunks = []
for text in pdf_texts:
    chunks = splitter.split_text(text)  # Split each PDF's text into chunks
    all_chunks.extend(chunks)  # Add the chunks to the all_chunks list


In [23]:
print(f"Documents passed to the model: {all_chunks}")




In [24]:
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS

# Initialize OllamaEmbeddings (Make sure to use an up-to-date model)
embedding_model = OllamaEmbeddings(model="llama3.2")

# Generate the FAISS vector store from the chunks
vectorstore = FAISS.from_texts(all_chunks, embedding=embedding_model)

# Save the vector store locally for later use
vectorstore.save_local("marbet_vectorstore")


In [None]:
from langchain.chains import RetrievalQA
from langchain_ollama import ChatOllama

from langchain.prompts import ChatPromptTemplate

# Define the language model and the prompt template
llm = ChatOllama(
    base_url="http://localhost:11434", 
    model="llama3.2"
)

# Use the FAISS vector store as the retriever
retriever = vectorstore.as_retriever()

# Define the template for the chatbot response
template = """
You are a helpful assistant. Answer the following question based on the context provided. 

Here is some context about the documents:
1. Activities and excursions
2. Packing list
3. Scenic Eclipse A-Z Guide
4. Tutorials and additional documents
5. Travel Itinerary

{context}

Now, answer the user's question:
"""

# template =  """
# You are a helpful assistant. Answer the question based on the following documents:

# {context}

# Now, answer the user's question:
# """


prompt = ChatPromptTemplate.from_template(template)

# Set up the RetrievalQA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)


In [46]:
docs = retriever.get_relevant_documents('What document must you upload during an ESTA or eTA application?')
for doc in docs:
    print(doc.page_content)

If you have an existing and valid eTA travel authorization, you do not need a new application.
You will need the following documents for the application:
https://www.canada.ca/en/immigration-refugees-citizenship/services/visit-canada/eta/apply-de.html
directly to the Government of Canada. 
Payment can only be made by credit card.
three separate forms for a family of three.
Please make sure that you print out your receipt immediately. It is not possible to print it out at a later date. A copy can also no 
longer be issued.
on board and at the end of your journey for the border authorities.
The eTA travel authorization is valid for up to five years after it is issued or until your passport expires.


In [None]:
query = input("Ask your question: ").strip()

if query:
    # Retrieve context based on actual user query
    docs = retriever.get_relevant_documents(query)
    for i, doc in enumerate(docs):
        print(f"Document {i+1}:", doc.page_content)

    # Run the QA chain with actual query
    # Run the QA chain with actual query
    result = qa.invoke({"query": query})
    print("Answer:", result["result"])
else:
    print("No question was asked. Please try again.")

Document 1: any other VISA provider to ensure that the application and approval process runs smoothly.
Document 2: any other VISA provider to ensure that the application and approval process runs smoothly.
Document 3: you are welcome to take with 
you after your stay.
Bars
The Scenic Eclipse I offers a diverse 
selection of bars and lounges. You 
can find the opening times in your 
Daily Wonder TV program.
Document 4: currently costs $21 (as of February 2024) (approx. EUR 20.00) and is available on the official website:
https://esta.cbp.dhs.gov/ directly to the CBP (border authority).
Answer: I don't see a question in your message. Could you please rephrase or ask a specific question related to the provided context about Scenic Eclipse? I'll be happy to help once I understand what you're asking.


In [None]:
# Interactive loop for asking questions
while True:
    # Ask the user for input
    question = input("I'm here to help you! What is your question? (type 'exit' to quit): ")
    
    # Exit condition
    if question.lower() == 'exit':
        break
    
    # Get the answer using the QA system
    result = qa.invoke({"documents": all_chunks, "query": question})
    
    # Print the result
    print(result["result"])

#     # Retrieve documents based on the question
#     retrieved_docs = retriever.invoke({"query": question}) 
#     print(f"Retrieved documents: {retrieved_docs}")  # Debug: print retrieved documents

#  # Ensure 'query' key is used and 'documents' are passed correctly
#     prompt_to_model = prompt.format(context=" ".join([doc.page_content for doc in retrieved_docs]))  # Ensure proper context format
#     print(f"Prompt being sent to the model: {prompt_to_model}")  # Debug: print the generated prompt


#     # Ensure 'query' key is used and 'documents' are passed correctly
#     result = qa.invoke({"documents": retrieved_docs, "query": question})
    
#     # Print the result (make sure to check the model output)
#     print(f"Raw output: {result}")  # Debug: print raw model output

#     # Print only the result part to show the final answer
#     print(f"Answer: {result['result']}")


Question asked: What activities are offered in Boston during the trip?
