<a href="https://colab.research.google.com/github/mahihubb/Chat-with-pdf-and-website/blob/master/chat_with_pdf's__and_website.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

In [37]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS  # Facebook AI similarity search
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import docx
import os
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_core.callbacks import StdOutCallbackHandler
from sentence_transformers import SentenceTransformer

def main():
    """Run the interactive "Ask Your PDF" console session.

    Loads environment variables, asks for a comma-separated list of file
    paths, and (if the user confirms) builds a conversation chain over the
    files' text. It then answers questions in a loop until the user types
    'exit'.
    """
    load_dotenv()
    print("Ask Your PDF")

    # File upload simulation. Paths are trimmed so that "a.pdf, b.pdf" works,
    # and blank entries (e.g. from a trailing comma) are dropped.
    raw_paths = input("Enter the paths of your files separated by commas: ")
    uploaded_files = [path.strip() for path in raw_paths.split(',') if path.strip()]

    # Anything other than "yes" ends the session without processing.
    process = input("Process files? (yes/no): ").strip().lower() == 'yes'
    if not process:
        return

    if not uploaded_files:
        print("No file paths were provided.")
        return

    files_text = get_files_text(uploaded_files)
    text_chunks = get_text_chunks(files_text)
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)

    while True:
        user_question = input("Ask a question about your files (or type 'exit' to quit): ").strip()
        if user_question.lower() == 'exit':
            break
        handle_user_input(user_question, conversation)

def get_files_text(uploaded_files):
    """Extract and concatenate the text of every file in *uploaded_files*.

    Args:
        uploaded_files: Iterable of file path strings.

    Returns:
        The extracted text of all files joined together in input order.
        ``.pdf`` and ``.docx`` files use their dedicated extractors; every
        other extension is routed to the CSV extractor.
    """
    parts = []
    for uploaded_file in uploaded_files:
        # Trim stray whitespace left over from comma-separated input and
        # ignore blank entries instead of treating them as files.
        path = uploaded_file.strip()
        if not path:
            continue

        # Compare extensions case-insensitively so "Report.PDF" is a PDF.
        file_extension = os.path.splitext(path)[1].lower()
        if file_extension == ".pdf":
            parts.append(get_pdf_text(path))
        elif file_extension == ".docx":
            parts.append(get_docx_text(path))
        else:
            parts.append(get_csv_text(path))
    return "".join(parts)

def get_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator.
    """
    pdf_reader = PdfReader(pdf_path)
    # A page with nothing extractable is treated as empty text, so a falsy
    # result (e.g. None) cannot break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def get_docx_text(file_path):
    """Return the paragraph texts of a .docx file joined by single spaces."""
    document = docx.Document(file_path)
    return ' '.join(paragraph.text for paragraph in document.paragraphs)

def get_csv_text(file_path):
    """Return a fixed placeholder message; *file_path* is not read.

    CSV text extraction is not implemented yet.
    """
    placeholder = "CSV processing is not implemented yet."
    return placeholder

def get_text_chunks(text):
    """Split *text* into chunks using a newline-separated character splitter.

    The splitter is configured with a chunk size of 900, an overlap of 100,
    and ``len`` as the length function.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 900,
        "chunk_overlap": 100,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

def get_vectorstore(text_chunks):
    """Build a FAISS vector store from *text_chunks*.

    Args:
        text_chunks: List of text strings to embed and index.

    Returns:
        The FAISS store created from the chunks, embedded with a
        default-configured ``HuggingFaceEmbeddings`` instance.

    Raises:
        ValueError: If *text_chunks* is empty.
    """
    # Fail early with a clear message: there is nothing to index when the
    # input files produced no text (e.g. image-only PDFs).
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; the provided files may contain no extractable text."
        )

    embeddings = HuggingFaceEmbeddings()
    return FAISS.from_texts(text_chunks, embeddings)

def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain over *vectorstore*.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies context.

    Returns:
        A ``ConversationalRetrievalChain`` using the ``google/flan-t5-large``
        Hugging Face Hub model, buffer memory stored under ``chat_history``,
        and a stdout callback handler.

    Raises:
        RuntimeError: If ``HUGGINGFACEHUB_API_TOKEN`` is not set.
    """
    # The token must come from the environment (e.g. a .env file loaded with
    # load_dotenv()). It is deliberately NOT assigned here: overwriting the
    # variable with a placeholder would clobber a real token.
    if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
        raise RuntimeError(
            "HUGGINGFACEHUB_API_TOKEN is not set. Add it to your environment "
            "or .env file before starting a conversation."
        )

    handler = StdOutCallbackHandler()
    # NOTE(review): temperature 5 is unusually high for question answering;
    # confirm it is intended (0.5 may have been meant).
    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 5, "max_length": 64})
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        callbacks=[handler]
    )
    return conversation_chain

def handle_user_input(user_question, conversation):
    """Send *user_question* to *conversation* and print the chat history.

    The conversation is called with a ``{'question': ...}`` payload. Messages
    in the returned ``chat_history`` are printed alternately as "User" (even
    positions) and "Bot" (odd positions).
    """
    result = conversation({'question': user_question})

    print("\nChat History:")
    for position, entry in enumerate(result['chat_history']):
        speaker = "User" if position % 2 == 0 else "Bot"
        print(f"{speaker}: {entry.content}")


# Start the "Ask Your PDF" session when this module is the entry point.
if __name__ == '__main__':
    main()


Ask Your PDF
Enter the paths of your files separated by commas: /content/drive/MyDrive/Chat with PDF and Website/Sample2.pdf
Process files? (yes/no): yes


  embeddings = HuggingFaceEmbeddings()


Ask a question about your files (or type 'exit' to quit): what is task2


[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Task 1: Chat with PDF Using RAG Pipeline Overview The goal is to implement a Retrieval-Augmented Generation (RAG) pipeline that allows users to interact with semi-structured data in multiple PDF ﬁles. The system should extract, chunk, embed, and store the data for eFicient retrieval. It will answer user queries and perform comparisons accurately, leveraging the selected LLM model for generating responses. Functional Requirements 1. Data Ingestion • Input: PDF ﬁles containing semi-structured data. • Process: o Extract text and relevant structured information fr

In [38]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from bs4 import BeautifulSoup
import os

load_dotenv()
from bs4 import BeautifulSoup
import requests

def get_page_title(url, timeout=10):
    """Fetch *url* and return the text of its HTML ``<title>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive site cannot hang the program indefinitely.

    Returns:
        The stripped title text, or "No title found" when the page has no
        title element or the title is empty.
    """
    # Fetch the page content
    response = requests.get(url, timeout=timeout)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.title
    if title_tag is None:
        return "No title found"

    # `.string` is None when the tag is empty or has nested markup; fall back
    # to the tag's combined text in that case.
    title = (title_tag.string or title_tag.get_text()).strip()
    return title or "No title found"

# Function to load the vector store
from langchain_core.documents import Document

def get_vectorstore_from_url(url):
    """Load the page at *url* and index its content in a Chroma vector store.

    Args:
        url: Address of the web page to load.

    Returns:
        A Chroma vector store built from the page's text chunks, plus one
        extra chunk holding the page title when the loader reports one.

    Raises:
        ValueError: If the loader returns no documents for *url*.
    """
    # Get the text in document form
    loader = WebBaseLoader(url)
    documents = loader.load()
    if not documents:
        raise ValueError(f"No content could be loaded from {url!r}.")

    # The loader returns already-extracted text in page_content, so the HTML
    # <title> tag is no longer present there; the title is exposed through
    # the document metadata instead.
    title = (documents[0].metadata.get("title") or "").strip()

    # Add the title as a separate chunk for better retrieval, but only when
    # one exists, so no placeholder text is indexed as page content.
    document_chunks = [Document(page_content=title)] if title else []

    # Split the rest of the document into chunks
    text_splitter = RecursiveCharacterTextSplitter()
    document_chunks.extend(text_splitter.split_documents(documents))

    # Create a vectorstore from the chunks using HuggingFaceEmbeddings
    embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
    return Chroma.from_documents(document_chunks, embeddings)

def load_model_from_huggingface(model_name="t5-small"):
    """Load a seq2seq model and its tokenizer by name.

    Args:
        model_name: Identifier passed to ``from_pretrained``; defaults to
            "t5-small".

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return loaded_model, loaded_tokenizer

# Replace the previous retriever chain
def get_context_retriever_chain(vector_store, local_llm_model, local_llm_tokenizer):
    """Build a history-aware retriever backed by a local seq2seq model.

    Args:
        vector_store: Store whose ``as_retriever()`` performs the lookup.
        local_llm_model: Model exposing ``generate(**inputs)``.
        local_llm_tokenizer: Tokenizer used to encode the prompt and decode
            the generated ids.

    Returns:
        The chain produced by ``create_history_aware_retriever`` from the
        local generation function, the retriever, and a prompt made of the
        chat history followed by the user input.
    """
    retriever = vector_store.as_retriever()
    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
    ])

    def generate_text(text):
        """Generate text with the local model for a prompt value or string."""
        # Prompt values expose to_string(), which renders the conversation as
        # plain text; str() on them would feed the object's repr to the model.
        if hasattr(text, "to_string"):
            text = text.to_string()
        elif not isinstance(text, str):
            text = str(text)  # Ensure text is a string
        inputs = local_llm_tokenizer(text, return_tensors="pt", truncation=True)
        outputs = local_llm_model.generate(**inputs)
        return local_llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

    retriever_chain = create_history_aware_retriever(generate_text, retriever, prompt)
    return retriever_chain

# Main function to handle the response
def get_response(user_input, vector_store, chat_history, local_llm_model, local_llm_tokenizer):
    """Answer *user_input* using the retrieval chain and the chat history.

    Builds the history-aware retriever and the conversational RAG chain, then
    invokes the chain with ``chat_history`` and ``input``.

    Returns:
        The first element when the chain returns a non-empty list, the
        ``'answer'`` entry when it returns a mapping, and a fixed fallback
        sentence when the list is empty or the key is missing.

    Raises:
        ValueError: If either chain builder returns ``None``.
    """
    fallback_answer = "I'm not sure how to answer that."

    history_retriever = get_context_retriever_chain(vector_store, local_llm_model, local_llm_tokenizer)
    if history_retriever is None:
        raise ValueError("Retriever chain could not be created.")

    # NOTE(review): get_conversational_rag_chain is not defined in the visible
    # part of this file; confirm it exists before relying on this function.
    rag_chain = get_conversational_rag_chain(history_retriever, local_llm_model, local_llm_tokenizer)
    if rag_chain is None:
        raise ValueError("Conversational retrieval chain could not be created.")

    payload = {
        "chat_history": chat_history,
        "input": user_input
    }
    result = rag_chain.invoke(payload)

    # The chain result may be a list or a mapping; handle both shapes.
    if isinstance(result, list):
        return result[0] if result else fallback_answer
    return result.get('answer', fallback_answer)


# Main program
def main(website_url="https://google.com"):
    """Run the interactive "Chat with Websites" console session.

    Args:
        website_url: Address of the site to chat about. Defaults to the URL
            that was previously hard-coded.

    Loads the local "t5-small" model, indexes the website, and answers
    questions in a loop until the user types "exit" or "quit". Questions
    containing the word "title" are answered by fetching the page title
    directly; all others go through the retrieval chain.
    """
    print("Chat with Websites\n")

    if not website_url:
        print("Please provide a valid URL.")
        return

    # Load model and tokenizer from Hugging Face
    model_name = "t5-small"
    print(f"Loading model: {model_name} from Hugging Face...")
    local_llm_model, local_llm_tokenizer = load_model_from_huggingface(model_name)

    print("Loading the website content...")
    vector_store = get_vectorstore_from_url(website_url)
    print("Content loaded successfully.\n")

    chat_history = [AIMessage(content="Hello, I am a bot. How can I help you?")]
    print(f"ai: {chat_history[-1].content}")

    while True:
        user_query = input("human: ").strip()
        if not user_query:
            # Nothing to answer; prompt again.
            continue
        if user_query.lower() in ["exit", "quit"]:
            print("ai: Exiting. Goodbye!")
            break

        # Both answer paths share one error boundary so that a network or
        # model failure is reported without ending the chat session.
        try:
            if 'title' in user_query.lower():
                title = get_page_title(website_url)
                response = f"The title of the website is {title}"
            else:
                response = get_response(user_query, vector_store, chat_history, local_llm_model, local_llm_tokenizer)
            chat_history.append(HumanMessage(content=user_query))
            chat_history.append(AIMessage(content=response))
            print(f"ai: {response}")
        except Exception as e:
            print(f"An error occurred: {e}")


# Start the "Chat with Websites" session when this module is the entry point.
if __name__ == "__main__":
    main()




Chat with Websites

Loading model: t5-small from Hugging Face...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loading the website content...
Content loaded successfully.

ai: Hello, I am a bot. How can I help you?
human: what is the title of the website
ai: The title of the website is Google
human: exit
ai: Exiting. Goodbye!
