# 🦜🔗 Langchain Q&A: Book

In [1]:
# !pip install langchain
# !pip install unstructured
# !pip install pdfminer.six

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### 1. Load your data

In [3]:
loader = UnstructuredPDFLoader("data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [4]:
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [5]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 201014 characters in your document


### 2. Chunk your data up into smaller documents

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [7]:
print (f'Now you have {len(texts)} documents')

Now you have 240 documents


In [8]:
texts[1]

Document(page_content='FOREWORDData Science touches every aspect of our lives on a daily basis. When we visit the doctor, drive our cars, get on an airplane, or shop for services, Data Science is changing the way we interact with and explore  our world.  Our world is now measured, mapped, and recorded in digital bits. Entire lives, from birth to death, are now catalogued in the digital realm. These data, originating from such diverse sources as connected vehicles, underwater microscopic cameras, and photos we post to social media, have propelled us into the greatest age of discovery humanity has ever known. It is through Data Science that we are unlocking the secrets hidden within these data. We are making discoveries that will forever change how we live and interact with the world around us. The impact of these changes is having a profound effect on humanity. We have propelled ourselves into this age of discovery through our incremental technological improvements. Data Science has bec

### Create embeddings of your documents to get ready for semantic search

In [9]:
#!pip install pinecone-client

In [10]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [11]:
import os
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')  # find at app.pinecone.io
PINECONE_API_ENV = 'us-west4-gcp'

In [12]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [13]:
# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  
    environment=PINECONE_API_ENV 
)
index_name = "langchain2"

In [14]:
# build embeddings
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

### 📖 Query `pinecone` embeddings to get Q&A response

In [30]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [31]:
query = "What was the weight of the ox, and what methods can be used to measure it"
docs = docsearch.similarity_search(query, include_metadata=True)

chain.run(input_documents=docs, question=query)

' The weight of the ox was 1,198 pounds, and the method used to measure it was to collect the guesses of the 787 entrants and compute the mean.'

In [32]:
query = "Are highly complex methods preferred over simpler ones?"
docs = docsearch.similarity_search(query, include_metadata=True)

chain.run(input_documents=docs, question=query)

' No, simpler methods can often provide the same insight and are easier and faster to prototype, implement, and verify.'

In [33]:
query = "What email can I use to contact the author"
docs = docsearch.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query)

' You can email the author at data_science@bah.com.'

In [52]:
from rich.jupyter import print

query = "What are the 5 best skills to teach children, with a short sentence on why. Answer in a list with new lines. Use bold tags for the skills (i.e 1. [bold]bold[/bold]: Response)"
docs = docsearch.similarity_search(query, include_metadata=True)
ret = chain.run(input_documents=docs, question=query)
print(ret)

In [39]:
docs

[Document(page_content='and fill them to avoid surprises. Grammar, spelling and graphics matter; your audience will lose confidence in your analysis if your results look sloppy. ›Where would we head next? No analysis is ever finished, you just run out of resources. Understand and explain what additional measures could  be taken if more resources  are found. » Tips From the ProsBetter a short pencil than a long memory. End every day by documenting where you are; you may learn something along the way. Document what you learned and why you changed your plan. » Tips From the ProsTest your answers with a friendly audience to make sure your ﬁndings hold water. 51Take off the Training Wheels', metadata={}),
 Document(page_content='and fill them to avoid surprises. Grammar, spelling and graphics matter; your audience will lose confidence in your analysis if your results look sloppy. ›Where would we head next? No analysis is ever finished, you just run out of resources. Understand and explain w