In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

from dotenv import dotenv_values

In [4]:
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")
loader = PyPDFLoader("./The-Field-Guide-to-Data-Science.pdf")


In [5]:
data = loader.load()

In [9]:
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 110 document(s) in your data
There are 0 characters in your document


Chunk your data into smaller documents

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [11]:
print (f'Now you have {len(texts)} documents')

Now you have 133 documents


Create embeddings of your documents to get ready for semantic search

In [12]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [13]:
dotenv_values(".env")

OrderedDict([('OPENAI_API_KEY',
              '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'),
             ('PINECONE_API_KEY', '48cc812a-3ad1-49bf-a6ea-84c603934573'),
             ('PINECONE_API_ENV', 'us-west1-gcp-free')])

In [14]:
# Check to see if there is an environment variable with you API keys, if not, use what you put below
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV') # You may need to switch with your env

In [15]:
print('ka', OPENAI_API_KEY)

ka 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


In [16]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [17]:
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
index_name = "langchainun" # put in the name of your pinecone index here

In [19]:
docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

In [20]:
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query)

In [21]:
# Here's an example of the first document that was returned
print(docs[0].page_content[:450])

39 Start Here for the Basics 39 Start Here for the BasicsSuccess Starts at the T op
Data Science teams, no matter how they are deployed, must have 
sponsorship. /T_hese can start as grass roots eﬀorts by a few folks to 
start tackling hard problems, or as eﬀorts directed by the CEO. 
Depending on the complexity of the organization, direction from top-
down for large organizations is the best for assuaging fears and doubts 
of these new groups. 
D


Query those docs to get your answer back


In [22]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [23]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

In [24]:
query = "What is the collect stage of data maturity?"
docs = docsearch.similarity_search(query)

In [25]:
chain.run(input_documents=docs, question=query)


' The collect stage of data maturity focuses on collecting internal or external datasets. Gathering sales records and corresponding weather data is an example of the collect stage.'