# Question Answering over Documents with LangChain

In [31]:
from langchain.document_loaders import TextLoader, DirectoryLoader, PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
import os

In [6]:
# Load the OpenAI API key from a local file, if one is present.
#
# The key file is optional. When it is missing, or its first line is blank,
# any OPENAI_API_KEY already present in the environment is left untouched, so
# this cell can be run unconditionally (e.g. under Restart Kernel -> Run All)
# whether the key lives in a file or in the environment.

# Change this path to your key location
path_to_key = "../openai-api-key.txt"

if os.path.isfile(path_to_key):
    # "utf-8-sig" transparently drops a byte-order mark that some editors
    # prepend; without it the BOM would become part of the key.
    with open(path_to_key, encoding="utf-8-sig") as fo:
        # Only the first line of the file is read.
        key = fo.readline()

    # Never overwrite an existing key with an empty string.
    if key.strip():
        os.environ["OPENAI_API_KEY"] = key.strip()

if not os.environ.get("OPENAI_API_KEY"):
    # Surface the problem here rather than deep inside the first API call.
    print(f"OPENAI_API_KEY is not set and no usable key was found in {path_to_key!r}.")

## Q/A with a single document

In [7]:
# Source text for the single-document example.
path_to_file = "./state_of_the_union.txt"
loader = TextLoader(file_path=path_to_file)

In [10]:
# Build a vector index over whatever the loader yields.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


In [12]:
# Ask a question against the index; the answer is shown as the cell output.
query = "What did the president say about Justice Breyer?"
index.query(question=query)

' The president said that Justice Breyer is an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court, and thanked him for his service.'

In [13]:
# Same question as above, but the result also reports the source file(s).
index.query_with_sources(question=query)

{'question': 'What did the president say about Justice Breyer?',
 'answer': ' The president thanked Justice Breyer for his service.\n',
 'sources': './state_of_the_union.txt'}

### Q/A over a single PDF

In [16]:
# Same single-document workflow, this time with a PDF as the source.
# Note: `loader` and `index` from the text example are rebound here.
path_to_pdf = "./research_papers/attention_is_all_you_need.pdf"
loader = PyPDFLoader(file_path=path_to_pdf)

pdf_index_creator = VectorstoreIndexCreator()
index = pdf_index_creator.from_loaders([loader])


Using embedded DuckDB without persistence: data will be transient


In [17]:
# Query the PDF-backed index; the answer is shown as the cell output.
query = "What is a transformer model?"
index.query(question=query)

' A transformer model is an architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. It is used for sequence transduction tasks such as machine translation, textual entailment, and learning task-independent sentence representations.'

## Q/A with multiple documents using VectorStore

In [29]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.indexes.vectorstore import VectorstoreIndexCreator
from langchain.chains import RetrievalQA

In [25]:
# Load every file matching the glob from the papers folder.
directory = "./research_papers"
loader = DirectoryLoader(
    path=directory,
    glob="*.pdf",
    loader_cls=PyPDFLoader,  # pass a different class (e.g. TextLoader) for other file types
)
documents = loader.load()

# Number of Document objects returned by the loader.
print(len(documents))

27


In [26]:
# Break the loaded documents into chunks before embedding them
# (chunk size 1000, no overlap between consecutive chunks).
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

In [27]:
# Embed the chunks with OpenAI embeddings and store them in a Chroma vector store.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
)

Using embedded DuckDB without persistence: data will be transient


In [32]:
# Assemble a retrieval Q/A chain: the vector store supplies the context
# chunks and the OpenAI LLM produces the answer ("stuff" chain type).
qa_llm = OpenAI()
qa_retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=qa_llm,
    chain_type="stuff",
    retriever=qa_retriever,
)


In [33]:
# Run a question through the retrieval chain; the answer string is the cell output.
query = "What is a GPT model?"
qa.run(query)

' GPT stands for Generative Pre-training Transformer. It is a type of language model that uses an unsupervised approach to pre-train a deep learning model on large amounts of text data. It is based on the Transformer architecture, which was introduced in the paper "Attention Is All You Need". GPT models are used to generate text based on a given input sentence.'