In [58]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

## Load your data

In [45]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point at
# your own copy instead of editing the notebook; when it is unset, the original
# location is used so existing setups keep working.
PDF_PATH = os.getenv("PDF_PATH", "/Users/rloredon/Downloads/The-Field-Guide-to-Data-Science.pdf")
loader = UnstructuredPDFLoader(PDF_PATH)


In [46]:
import cv2

In [47]:
# Run the loader to read the PDF. The result is used below as a sequence of
# documents, each exposing its text through `page_content`.
data = loader.load()

In [48]:
# Summarise what the loader returned: how many documents there are, how long the
# first one is, and a 200-character preview of its text.
print(f'You have {len(data)} document(s) in your data')
sample_text = data[0].page_content
print(f'There are {len(sample_text)} characters in your sample document')
print(f'Here is a sample: {sample_text[:200]}')


You have 1 document(s) in your data
There are 156594 characters in your sample document
Here is a sample: T H E F I ELD G U I D E

to D A T A S C I E N C E

© COPYRIGHT 2013 BOOZ ALLEN HAMILTON INC. ALL RIGHTS RESERVED.

FOREWORDEvery aspect of our lives, from life-saving disease treatments, to national s


## Splitting the "Document" stored in the `data` variable into smaller documents

In [49]:
# Break the loaded document into chunks of around 500 characters each, with a
# 50-character overlap. These are relatively small chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
texts = text_splitter.split_documents(data)

In [50]:
# Report how many chunks the splitter produced
chunk_total = len(texts)
print(f'Now you have {chunk_total} documents')

Now you have 441 documents


### Let's print one chunk just to see what it looks like, say the one at index 250

In [27]:
# Display a single chunk (index 250) to inspect what the splitter produced.
# NOTE(review): the index is hard-coded and assumes `texts` holds more than 250 chunks.
texts[250]

Document(page_content='the diagram to the appropriate class of analytic techniques and provide recommendations for a few techniques to consider.(cid:110)(cid:3)TIP: There are several situations where dimensionality reduction may be needed: ›Models fail to converge ›Models produce results equivalent to random chance ›You lack the computational power to perform operations across the feature space ›You do not know which aspects of the data are the most important(cid:111)(cid:3)(cid:3)Feature Extraction is a broad topic', metadata={'source': '/Users/rloredon/Downloads/The-Field-Guide-to-Data-Science.pdf'})

## Create embeddings of your documents to get ready for semantic search

In [51]:
from langchain.vectorstores import Chroma
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

In [52]:
# Load variables from a local .env file (if one exists) into the process
# environment before reading the key. `load_dotenv` was imported but never
# called, so a key kept only in .env would otherwise be read as None.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [53]:
# Create the embeddings client, authenticating with the OpenAI key read above.
# No model is passed here, so the library default is used (the original note says
# OpenAI's ada — TODO confirm against the installed langchain_openai version).
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

### Using Pinecone to store the embeddings

In [54]:
# Target index: "langchain1" must already be created in Pinecone (choose 1536
# dimensions, for working with OpenAI).
index_name = "langchain1"

# Read the Pinecone key from the environment.
# NOTE(review): the key is not passed to the vector store below — presumably the
# Pinecone client picks it up from the environment; confirm.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Embed the text of every chunk with the OpenAI embeddings and hand the vectors to
# the Pinecone vector store in the cloud. Number of vectors = number of chunks.
chunk_texts = [chunk.page_content for chunk in texts]
vectorstore = PineconeVectorStore.from_texts(
    chunk_texts,
    embedding=embeddings,
    index_name=index_name,
)

### Let's ask our book questions

In [57]:
# Natural-language question to look up in the book.
query = "What are the characteristics of a good data science team ?"
# Ask the vector store for the stored chunks it ranks as most similar to the question.
query_results = vectorstore.similarity_search(query)

[Document(page_content='with other Data Science teams is critical to the success.'),
 Document(page_content='Building a team starts with identifying existing staff within an organization who have a high aptitude for Data Science. Good candidates will have a formal background in any of the three foundational technical skills we mentioned, and will most importantly have the personality traits necessary for Data Science. They may often have advanced (masters or higher) degrees, but not always. The very first staff you identify should also have good leadership traits and a sense of purpose for the'),
 Document(page_content='Performing Data Science requires the extraction of timely, actionable information from diverse data sources to drive data products. Examples of data products include answers to questions such as: “Which of my products should I advertise more heavily to increase profit? How can I improve my compliance program, while reducing costs? What manufacturing process change will 

### Let's chain those search results into OpenAI to get our answer back in natural language

In [61]:
from langchain_openai.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain


In [62]:
# Completion model that will write the final answer; temperature=0 asks for the
# least random output.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Question-answering chain built on that model.
# NOTE(review): "stuff" presumably places all supplied documents into a single prompt — confirm.
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
query = 