In [27]:
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyMuPDFLoader
from langchain.document_loaders import DirectoryLoader
from dotenv import load_dotenv

In [29]:
# Run python-dotenv's loader, then read the OpenAI key from the process
# environment so it can be handed explicitly to the clients created later
# in the notebook. The value is None when the variable is not set.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [8]:
# Load the source PDF with PyMuPDFLoader; `documents` holds whatever the loader returns.
loader = PyMuPDFLoader('./Origin_of_Species.pdf')
# Alternative kept for reference: load every .txt file found in a folder instead of a single PDF.
# loader = DirectoryLoader('./new_articles/', glob="./*.txt", loader_cls=TextLoader)

# Read the file into memory (presumably one Document per PDF page -- confirm against the loader docs).
documents = loader.load()

In [11]:
# Split the loaded documents into overlapping chunks so each piece can be
# embedded and retrieved on its own (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [34]:
# Embed the chunks and keep them in a Chroma vector store.
# Supplying a persist_directory stores the embeddings on disk.
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Make the cell safe to re-run: building the store again would re-embed every
# chunk (paid API calls) and append duplicate entries to the persisted
# collection, so retrieval would start returning the same passage repeatedly.
# If the directory is already populated, reopen it instead of rebuilding.
# NOTE: delete the `db` directory to force a rebuild, e.g. after changing the
# source document or the chunking parameters.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

In [None]:
# Now we can load the persisted database from disk, and use it as normal. 
# vectordb = Chroma(persist_directory=persist_directory, 
#                   embedding_function=embedding)

In [35]:
# Wrap the vector store in a retriever using the store's default search
# settings (no search_kwargs are passed here).
retriever = vectordb.as_retriever()

In [36]:
# Sanity-check retrieval on its own before wiring the retriever into a chain.
# `invoke` is used instead of `get_relevant_documents`, which is deprecated and
# emits a deprecation warning when called.
docs = retriever.invoke("Chapter IX On the Imperfection of the Geological Record")
docs

  warn_deprecated(


[Document(page_content="vehemently, maintained the immutability of species.  But I have reason to believe that one great\nauthority, Sir Charles Lyell, from further reflexion entertains grave doubts on this subject.  I feel\nhow rash it is to differ from these great authorities, to whom, with others, we owe all our\nknowledge.  Those who think the natural geological record in any degree perfect, and who do not\nattach much weight to the facts and arguments of other kinds given in this volume, will\nundoubtedly at once reject my theory.  For my part, following out Lyell's metaphor, I look at the\nnatural geological record, as a history of the world imperfectly kept, and written in a changing\ndialect; of this history we possess the last volume alone, relating only to two or three countries.  Of\nthis volume, only here and there a short chapter has been preserved; and of each page, only here\nand there a few lines.  Each word of the slowly-changing language, in which the history is", met

In [37]:
# Rebind `retriever` (replacing the default one created earlier) with k=2
# passed to the search -- presumably the number of chunks returned per query;
# confirm against the vector store docs.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [38]:
# Build the question-answering chain on top of the retriever.
# return_source_documents=True makes the response carry the retrieved chunks
# under "source_documents" alongside the answer under "result".
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

  warn_deprecated(


In [50]:
## Show the answer together with the passages it was based on
def process_llm_response(llm_response):
    """Print the chain's answer followed by the text of each source chunk.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects exposing
            ``page_content``).

    Returns:
        None. Everything is written to standard output: the answer, a
        'Sources:' header preceded by two blank lines, then one
        ``page_content`` per source document.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    retrieved_chunks = llm_response["source_documents"]
    for chunk in retrieved_chunks:
        print(chunk.page_content)

In [51]:
# Full example: send one question through the QA chain, then print the answer
# followed by the text of the chunks returned as its sources.
query = "what is this book mainly about?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 The book is mainly about the theory of evolution and the various challenges and evidence that support it. 


Sources:
Summary of last and present Chapters -- In these chapters I have endeavoured to show, that if we
make due allowance for our ignorance of the full effects of all the changes of climate and of the
level of the land, which have certainly occurred within the recent period, and of other similar
changes which may have occurred within the same period; if we remember how profoundly
ignorant we are with respect to the many and curious means of occasional transport,--a subject
which has hardly ever been properly experimentised on; if we bear in mind how often a species
may have ranged continuously over a wide area, and then have become extinct in the intermediate
tracts, I think the difficulties in believing that all the individuals of the same species, wherever
located, have descended from the same parents, are not insuperable.  And we are led to this
conclusion, which has been