In [None]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

: 

### Load your data

In [6]:
# Point the loader at the local PDF that the rest of the notebook chunks, indexes and queries.
loader = UnstructuredPDFLoader("ArtsEquityToolkit.pdf")
# Disabled alternative: load a PDF from a URL instead. OnlinePDFLoader is not among the
# imports visible in this notebook, so it must be imported before enabling this line.
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [7]:
# Run the loader. The result is an indexable collection of documents: later cells
# read len(data) and data[0].page_content, and pass data to the text splitter.
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [8]:
# Sanity-check the load: how many documents came back, and how much text the first one holds.
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 227557 characters in your document


### Chunk your data up into smaller documents

In [9]:
# Configure the splitter with a chunk_size of 1000 and no overlap between
# consecutive chunks, then split the loaded document(s) into smaller documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(data)

In [10]:
# Report how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

Now you have 234 documents


### Create embeddings of your documents to get ready for semantic search

In [11]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [16]:
import os

# Read credentials from the environment rather than typing them into the notebook,
# so secrets are never saved with the file. Each value is None when its variable is
# unset, which matches the placeholder values this cell used before.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")  # find at app.pinecone.io
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")  # next to api key in console

In [17]:
# OpenAI embeddings client; it is handed to the Pinecone vector store when the index is built.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [18]:
# Name of the Pinecone index that the vector store is created against.
index_name = "querypdf"

# Initialise the Pinecone client with the credentials from the config cell.
pinecone.init(
    environment=PINECONE_API_ENV,  # shown next to the API key in the console
    api_key=PINECONE_API_KEY,  # available at app.pinecone.io
)

In [19]:
# Embed every chunk and store it in the Pinecone index named by index_name.
# from_documents passes each chunk's metadata along with its text; the earlier
# from_texts call sent only page_content, so the metadata requested by the
# include_metadata=True searches was never stored.
# NOTE(review): assumes chunk metadata holds only Pinecone-compatible values — confirm.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [21]:
# First retrieval check: fetch the chunks most similar to a simple question.
query = "Who is the author of this book?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

### Query those docs to get your answer back

In [22]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [23]:
# Completion model for answering; temperature is pinned to 0.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)
# Question-answering chain over the retrieved documents, using the "stuff" chain type.
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [37]:
# Retrieve the chunks most relevant to the question about the five principles.
query = "What are the five equity principles?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [39]:
# Answer the current question from the retrieved chunks; the returned string is the cell output.
chain.run(
    question=query,
    input_documents=docs,
)

' The five equity principles are Flexibility and Adaptability, Reflexivity and Relationships, Relevance and Representation, Embeddedness, and Sustainability.'

In [40]:
# Follow-up question: retrieve chunks about the case study under 'Embeddedness'.
query = "What is the case study mentioned within 'Embeddedness' in five equity principles?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [41]:
# Answer the case-study question from the chunks retrieved for it.
chain.run(
    question=query,
    input_documents=docs,
)

" The case study mentioned within 'Embeddedness' in five equity principles is Scarborough Arts on pages 36 and 43 of the Equity in Practice section of the Arts & Equity Toolkit."

In [47]:
# Same case-study question, now also asking for a summary; retrieval is re-run for the new wording.
query = "What is the case study mentioned within 'Embeddedness' in five equity principles? Summarize it."
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [48]:
# Produce the summary answer from the chunks retrieved for the summary question.
chain.run(
    question=query,
    input_documents=docs,
)

" The case study mentioned in the Embeddedness principle is about equitable community-engaged arts. It emphasizes the importance of weaving arts into the fabric of community life as an integral thread, rather than an add-on or decoration. It also discusses the need for long-term commitment to change through organizational and programmatic restructuring that more equitably distributes decision-making and authority. Finally, it emphasizes the need to recognize and support the culture that already exists within people's everyday lives and interactions."