# Experiment 15

#### Problem Statement:
Implement a simple PDF document search using an open-source generative AI model.

#### Install Dependencies:

In [1]:
! pip install langchain pypdf faiss-cpu sentence-transformers



#### Code:

In [2]:
# importing required libraries
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
# Provide the Hugging Face Hub API token without embedding it in the notebook.
# A token already exported as HUGGINGFACEHUB_API_TOKEN is reused; otherwise the
# user is prompted for one (the input is not echoed, so it never ends up in the
# saved notebook or its outputs). Any token that was ever committed in this cell
# should be considered leaked and revoked.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass  # local import: only needed for the prompt

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

In [4]:
# Point PyPDFLoader at the paper's download URL and load it.
PDF_URL = "https://scholarworks.calstate.edu/downloads/vq27zt20r"

pdf_loader = PyPDFLoader(PDF_URL)
# The loaded result is iterated page by page in the next step.
pdf_document = pdf_loader.load()

In [5]:
# Flatten the loaded pages into a single raw-text string.
# Pages whose page_content is None (or empty) contribute nothing, and a single
# str.join replaces the repeated string concatenation in a loop.
pdf_text = "".join(page.page_content or "" for page in pdf_document)


In [6]:
# Configure a CharacterTextSplitter that cuts the text on newlines into
# overlapping chunks; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 800     # target size of each chunk
CHUNK_OVERLAP = 200  # amount of text shared by consecutive chunks

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Apply the splitter to the raw PDF text; the resulting chunks are what gets
# embedded and indexed later on.
chunks = text_splitter.split_text(pdf_text)

In [7]:
# loading the Hugging Face embeddings with the library's default model (not OpenAI embeddings)
embeddings = HuggingFaceEmbeddings()

In [8]:
# Embed every chunk and build the FAISS index used for similarity search.
document_search = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [16]:
# Set up the google/flan-t5-xl model through the Hugging Face Hub wrapper.
# NOTE(review): max_length of 1000000 looks far larger than any answer needs —
# confirm whether a smaller limit was intended.
generation_kwargs = {"temperature": 1, "max_length": 1000000}

model = HuggingFaceHub(
    repo_id="google/flan-t5-xl",
    model_kwargs=generation_kwargs,
)



In [10]:
# Build the question-answering chain around the model, using the "stuff"
# chain type.
chain = load_qa_chain(llm=model, chain_type="stuff")

In [21]:
# Question to ask about the PDF.
query = "What are the algorithms used in the paper?"

# Retrieve the chunks most similar to the question (k=4 is the library default,
# spelled out here), then let the QA chain answer from them.
docs = document_search.similarity_search(query, k=4)
answer = chain.run(question=query, input_documents=docs)

print(answer)

DQN [11] and A2C [21]
