In [132]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings # Import HuggingFaceEmbeddings here
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub
from huggingface_hub import login
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import os
import warnings
# Install a global "ignore" filter so Python warnings are not shown in cell output.
warnings.filterwarnings('ignore')
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the Hugging Face token is defined — confirm
# the .env file is discoverable from the notebook's working directory.
load_dotenv()

True

In [133]:
# Read the Hugging Face token from the process environment.
# os.getenv() expects the NAME of an environment variable, not a filesystem path;
# looking up a path-like name always returns None, which makes login() fall back
# to its interactive prompt instead of using the stored token.
# NOTE(review): assumes the .env file defines HUGGINGFACEHUB_API_TOKEN — confirm.
api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# If the variable is missing, api_key is None and login() prompts interactively,
# which matches the previous fallback behaviour.
login(token=api_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [134]:
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A single string holding the text of every page of every PDF, with files
        processed in sorted filename order. Returns "" when the folder contains
        no PDF files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from os.listdir).
    """
    # os.listdir() returns entries in arbitrary order, so sort the names to make
    # the concatenated text (and therefore the chunks built from it) reproducible.
    # Match the extension case-insensitively and skip directories that merely
    # have a name ending in ".pdf", which PdfReader cannot open.
    pdf_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    page_texts = []
    for pdf_name in pdf_names:
        pdf_reader = PdfReader(os.path.join(folder_path, pdf_name))
        for page in pdf_reader.pages:
            # Defensive: treat a missing/None extraction result (e.g. an
            # image-only page) as empty text instead of failing on concatenation.
            page_texts.append(page.extract_text() or "")

    # Join once at the end rather than growing a string page by page.
    return "".join(page_texts)


def get_text_chunks(text):
    """Split raw text into overlapping chunks using CharacterTextSplitter.

    The splitter breaks on newline characters, targets a chunk size of 1000
    with an overlap of 200, and measures length with the built-in ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)


def get_vectorstore(text_chunks):
    """Embed text chunks with the hkunlp/instructor-xl model and index them in FAISS.

    Args:
        text_chunks: Non-empty sequence of strings to embed.

    Returns:
        The FAISS vector store built from ``text_chunks``.

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to index.
    """
    if not text_chunks:
        raise ValueError("text_chunks is empty; no text was extracted to index")

    # Select the model by name and let HuggingFaceEmbeddings load it itself.
    # The wrapper constructs its own sentence-transformers client from
    # `model_name` during initialisation, so a pre-built model passed via
    # `client=` is not the one used for embedding. Passing the name also removes
    # the dependency on a `SentenceTransformer` symbol that this notebook never
    # imports.
    embeddings = HuggingFaceEmbeddings(
        model_name="hkunlp/instructor-xl",
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vectorstore, model):
    """Assemble a ConversationalRetrievalChain over the given vector store.

    Args:
        vectorstore: Store whose ``as_retriever`` supplies the retriever
            (configured here with ``k`` = 5).
        model: Repo id passed to ``HuggingFaceHub`` as ``repo_id``.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``.

    Note:
        Reads the module-level ``api_key`` for the Hugging Face Hub token.
    """
    hub_llm = HuggingFaceHub(
        repo_id=model,
        model_kwargs={"temperature":0.7, "max_length":1024},
        huggingfacehub_api_token=api_key,
    )
    # Buffer memory stored under the 'chat_history' key, returned as messages.
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=top_k_retriever,
        memory=chat_memory,
    )

In [135]:
# Repo id handed to get_conversation_chain(), which forwards it to HuggingFaceHub.
model = "google/flan-t5-small"
# Folder scanned by get_pdf_text() for PDF files.
pdf_path = "/content/drive/MyDrive/Project/Chat_with_PDF/Data"
# Pipeline: extract text -> split into chunks -> embed and index -> build the chain.
# Each step consumes the previous step's result, so the order must be preserved.
text = get_pdf_text(pdf_path)
chunks = get_text_chunks(text)
vectorstore = get_vectorstore(chunks)
chain = get_conversation_chain(vectorstore, model)

In [136]:
# Prompt sent to the chain: general answering instructions followed by the
# actual question on the "Query:" line.
query = """
    Please provide a detailed explanation about the following query.
    The goal is to provide a comprehensive and elaborative answer.
    If the answer is not known, kindly mention that.

    Query: how many person are there?
"""

# Run the prompt through the chain built in the previous cell and print the answer.
response = chain.run(query)
print(response)

There are a total of 84 people.
