<a href="https://colab.research.google.com/github/mahihubb/Chat-with-pdf-and-website/blob/master/chat_with_pdf's__and_website.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -r requirements.txt

In [None]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS  # Facebook AI similarity search
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import docx
import os
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_core.callbacks import StdOutCallbackHandler
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


def main():
    """Run an interactive command-line question/answer session over local files.

    Prompts for a comma-separated list of file paths, asks for confirmation,
    builds a vector index over the extracted text and then answers questions
    in a loop until the user types 'exit'.
    """
    load_dotenv()
    print("Ask Your PDF")

    # File upload simulation: paths are typed in rather than uploaded.
    raw_paths = input("Enter the paths of your files separated by commas: ")
    # Strip whitespace around each entry ("a.pdf, b.pdf") and drop empty
    # entries so stray commas or blank input do not become bogus paths.
    uploaded_files = [path.strip() for path in raw_paths.split(',') if path.strip()]
    # Anything other than an explicit "yes" ends the session.
    process = input("Process files? (yes/no): ").strip().lower() == 'yes'

    if not process:
        return
    if not uploaded_files:
        print("No files were provided.")
        return

    files_text = get_files_text(uploaded_files)
    text_chunks = get_text_chunks(files_text)
    vectorstore = get_vectorstore(text_chunks)
    conversation = get_conversation_chain(vectorstore)

    while True:
        user_question = input("Ask a question about your files (or type 'exit' to quit): ").strip()
        if user_question.lower() == 'exit':
            break
        handle_user_input(user_question, conversation)

def get_files_text(uploaded_files):
    """Extract and concatenate the text of every file in *uploaded_files*.

    Args:
        uploaded_files: Iterable of file-path strings.

    Returns:
        A single string with the extracted text of each file, in input order.

    Dispatch is by file extension, compared case-insensitively: ``.pdf`` goes
    to ``get_pdf_text``, ``.docx`` to ``get_docx_text`` and everything else to
    ``get_csv_text``. Surrounding whitespace is stripped from each path and
    blank entries are skipped.
    """
    parts = []
    for uploaded_file in uploaded_files:
        path = uploaded_file.strip()
        if not path:
            # Blank entries come from trailing commas or empty input.
            continue
        # Lower-case so "REPORT.PDF" is handled like "report.pdf".
        file_extension = os.path.splitext(path)[1].lower()
        if file_extension == ".pdf":
            parts.append(get_pdf_text(path))
        elif file_extension == ".docx":
            parts.append(get_docx_text(path))
        else:
            parts.append(get_csv_text(path))
    return "".join(parts)

def get_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Pages are joined in document order with no separator between them.
    """
    pdf_reader = PdfReader(pdf_path)
    # `or ""` keeps a page that yields no text (e.g. a scanned, image-only
    # page) from breaking the concatenation if extract_text() returns None.
    # NOTE(review): recent PyPDF2 releases appear to return "" in that case;
    # the guard is defensive for older versions -- confirm.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def get_docx_text(file_path):
    """Return the paragraph texts of the .docx file at *file_path*.

    The paragraphs are joined into one string separated by single spaces.
    """
    document = docx.Document(file_path)
    return ' '.join(paragraph.text for paragraph in document.paragraphs)

def get_csv_text(file_path):
    """Stand-in for CSV extraction.

    *file_path* is accepted but not read; a fixed notice string is returned
    because CSV text extraction has not been implemented.
    """
    placeholder_notice = "CSV processing is not implemented yet."
    return placeholder_notice

def get_text_chunks(text):
    """Split *text* into chunks using a newline-separated character splitter.

    The splitter is configured with a chunk size of 900 and an overlap of 100,
    both measured with ``len``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 900,
        "chunk_overlap": 100,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

def get_vectorstore(text_chunks):
    """Build and return a FAISS index over *text_chunks*.

    The chunks are embedded with a default-configured HuggingFaceEmbeddings.
    """
    return FAISS.from_texts(text_chunks, HuggingFaceEmbeddings())

def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain backed by *vectorstore*.

    Args:
        vectorstore: Object exposing ``as_retriever()``; used to fetch context
            for each question.

    Returns:
        A ``ConversationalRetrievalChain`` using the ``google/flan-t5-large``
        model from the Hugging Face Hub and a buffer memory stored under the
        ``chat_history`` key.

    Raises:
        ValueError: If ``HUGGINGFACEHUB_API_TOKEN`` is not set in the
            environment.
    """
    # SECURITY: the API token must come from the environment (for example a
    # .env file that is not committed), never from source code. A token that
    # was previously committed here should be treated as leaked and revoked.
    if not os.environ.get('HUGGINGFACEHUB_API_TOKEN', '').strip():
        raise ValueError(
            "HUGGINGFACEHUB_API_TOKEN is not set; export it or add it to your .env file."
        )
    llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 0.7, "max_length": 64})
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        verbose=False  # Suppress debug output
    )


def handle_user_input(user_question, conversation):
    """Send *user_question* to *conversation* and print the full chat history.

    *conversation* is called with ``{'question': user_question}`` and must
    return a mapping with a ``chat_history`` sequence of messages. Messages at
    even positions are printed as the user's, odd positions as the bot's.
    """
    result = conversation({'question': user_question})
    transcript = result['chat_history']

    print("\nChat History:")
    for position, entry in enumerate(transcript):
        speaker = "Bot" if position % 2 else "User"
        print(f"{speaker}: {entry.content}")


# Start the interactive session only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Ask Your PDF
Enter the paths of your files separated by commas: /content/drive/MyDrive/Chat with PDF and Website/Sample2.pdf
Process files? (yes/no): yes
Ask a question about your files (or type 'exit' to quit): what is task1?

Chat History:
User: what is task1?
Bot: Implement a Retrieval-Augmented Generation (RAG) pipeline that allows users to interact with semi-structured data in multiple PDF files
Ask a question about your files (or type 'exit' to quit): what is task 2?

Chat History:
User: what is task1?
Bot: Implement a Retrieval-Augmented Generation (RAG) pipeline that allows users to interact with semi-structured data in multiple PDF files
User: what is task 2?
Bot: Chat with Website Using RAG Pipeline
Ask a question about your files (or type 'exit' to quit): exit


In [18]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from bs4 import BeautifulSoup
import os
import requests

# Load environment variables from a local .env file, if one is present.
load_dotenv()

def get_page_title(url, timeout=10):
    """Fetch *url* and return the text of its ``<title>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The title text with surrounding whitespace removed, or
        ``"No title found"`` when the page has no title or the title is empty.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.title
    if title_tag is None:
        return "No title found"
    # get_text() is used instead of .string because .string is None when the
    # tag is empty or contains nested markup, which would leak None to callers.
    title = title_tag.get_text(strip=True)
    return title or "No title found"

from langchain_core.documents import Document

def get_vectorstore_from_url(url):
    """Load the page at *url* and return a Chroma vector store over its content.

    The page title is indexed as its own document, followed by the page text
    split into chunks of 500 characters with an overlap of 50. Embeddings use
    the ``all-mpnet-base-v2`` model.

    Args:
        url: Address of the page to load.

    Returns:
        A Chroma vector store built from the title and the text chunks.

    Raises:
        ValueError: If the loader returns no documents for *url*.
    """
    documents = WebBaseLoader(url).load()
    if not documents:
        raise ValueError(f"No content could be loaded from {url!r}")

    # The loader yields extracted text, not HTML, so re-parsing page_content
    # for a <title> tag cannot succeed. The loader records the page title in
    # the document metadata instead; fall back to the placeholder if absent.
    title = documents[0].metadata.get("title") or "Title not found"

    document_chunks = [Document(page_content=title)]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    document_chunks.extend(text_splitter.split_documents(documents))

    embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
    return Chroma.from_documents(document_chunks, embeddings)

def load_model_from_huggingface(model_name="t5-small"):
    """Load a seq2seq model and its tokenizer by name.

    Returns:
        A ``(model, tokenizer)`` tuple for *model_name*.
    """
    # The tokenizer is fetched first, then the model weights.
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return seq2seq_model, hf_tokenizer

def generate_text_with_model(text, model, tokenizer):
    """Generate output for *text* with *model* and return it decoded.

    *text* is tokenized with truncation at a maximum length of 512, the
    encoded inputs are passed to ``model.generate`` and the first generated
    sequence is decoded with special tokens skipped.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    generated = model.generate(**encoded)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_response(user_input, vector_store, model, tokenizer):
    """Answer *user_input* using documents retrieved from *vector_store*.

    The retrieved documents' contents are joined with newlines, the question
    is appended, and the result is passed to ``generate_text_with_model``.
    A fixed fallback message is returned when nothing is retrieved.
    """
    matches = vector_store.as_retriever().get_relevant_documents(user_input)

    if matches:
        context = "\n".join(match.page_content for match in matches)
        prompt = context + "\nQuestion: " + user_input
        return generate_text_with_model(prompt, model, tokenizer)

    return "I couldn't find any relevant information."

def main(website_url="https://www.google.com"):
    """Run an interactive command-line chat about a single website.

    Args:
        website_url: Address of the site to index and chat about. Defaults to
            the address that was previously hard-coded.

    The loop ends when the user types "exit" or "quit". Questions containing
    the word "title" are answered with the page title; all others go through
    retrieval and text generation.
    """
    print("Chat with Websites\n")

    # Validate the URL before the model load so a bad call fails fast.
    if not website_url:
        print("Please provide a valid URL.")
        return

    model_name = "t5-small"
    print(f"Loading model: {model_name} from Hugging Face...")
    local_llm_model, local_llm_tokenizer = load_model_from_huggingface(model_name)

    print("Loading the website content...")
    vector_store = get_vectorstore_from_url(website_url)
    print("Content loaded successfully.\n")

    # The history is recorded for the session but is not fed back into
    # retrieval or generation.
    chat_history = [AIMessage(content="Hello, I am a bot. How can I help you?")]
    print(f"ai: {chat_history[-1].content}")

    while True:
        user_query = input("human: ").strip()
        if not user_query:
            # Nothing to answer; prompt again.
            continue
        if user_query.lower() in ("exit", "quit"):
            print("ai: Exiting. Goodbye!")
            break

        # Both answer paths can fail (network fetch, retrieval, generation);
        # report the error and keep the session alive. This is the top-level
        # interactive boundary, hence the broad catch.
        try:
            if 'title' in user_query.lower():
                title = get_page_title(website_url)
                response = f"The title of the website is {title}"
            else:
                response = get_response(user_query, vector_store, local_llm_model, local_llm_tokenizer)
        except Exception as e:
            print(f"ai: Error during response generation: {e}")
            continue

        chat_history.append(HumanMessage(content=user_query))
        chat_history.append(AIMessage(content=response))
        print(f"ai: {response}")

# Start the website chat only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Chat with Websites

Loading model: t5-small from Hugging Face...
Loading the website content...
Content loaded successfully.

ai: Hello, I am a bot. How can I help you?
ai: The title of the website is Google
ai: Google 2024 - Privacy - Terms GoogleSearch Images Maps Play YouTube News G
ai: Question: exit
human: exit
ai: Exiting. Goodbye!
