In [1]:
import os

from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

ModuleNotFoundError: No module named 'langchain'

In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

folder_path = "input"  # Folder that holds the PDF files to index

# Flat list that accumulates whatever each loader returns
documents = []

# Walk the folder in sorted order: os.listdir returns names in arbitrary
# order, which would make the document (and later chunk) order vary per run.
for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped
    if os.path.isfile(file_path) and file_name.lower().endswith(".pdf"):
        loader = PyMuPDFLoader(file_path)
        loaded_documents = loader.load()
        documents.extend(loaded_documents)

# "documents" now holds everything loaded from the PDFs in folder_path

In [None]:
# Sanity check: how many entries the "documents" list contains
len(documents)

In [None]:
# Cut the loaded documents into overlapping chunks before embedding them
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_documents(documents)

# Display the number of chunks produced
len(texts)

In [None]:
# Embed the chunks and put them into a Chroma vector store.
# Passing persist_directory makes the store keep its embeddings on disk.
persist_directory = 'db'

## OpenAI embeddings for now; the plan is to swap in local embeddings later
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the db to disk
vectordb.persist()
# Drop the in-memory handle; "vectordb" is rebound to None on purpose
vectordb = None

In [None]:
# Re-open the database that was persisted to disk; from here on it is used
# exactly like the freshly built store.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [None]:
# Wrap the vector store as a retriever; "k": 2 presumably limits each query
# to the two best-matching chunks (confirm against the Chroma retriever docs).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
# Create the chain that answers questions and returns the documents it used.
# The class is RetrievalQAWithSourcesChain (plural "Sources"): the previous
# spelling did not exist and raised NameError. Its 'question' / 'answer' keys
# are the ones process_llm_response reads from the result.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm=OpenAI(),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
# Cite sources
def process_llm_response(llm_response):
    """Print the question, the answer and the cited sources of a chain result.

    Args:
        llm_response: Mapping with the keys ``'question'``, ``'answer'`` and
            ``'source_documents'``. Every source document must expose a
            ``metadata`` dict.

    For each source document the file location and the page number are
    printed. The location is taken from ``metadata['file_path']`` and falls
    back to ``metadata['source']``; when a field is absent the text
    ``unknown`` is printed instead of raising ``KeyError``.
    """
    print(f"Question: {llm_response['question']}")
    print(f"Answer: {llm_response['answer']}")
    print("\nSources:")
    for source in llm_response["source_documents"]:
        metadata = source.metadata
        # Not every loader fills 'file_path'; 'source' is the usual alternative
        file_path = metadata.get("file_path", metadata.get("source", "unknown"))
        print(f"File: {file_path}")
        print(f"Page Number: {metadata.get('page', 'unknown')}")

# Full example: send one question through the chain and print the answer
# together with the files and pages it was drawn from
query = "Give me the detailed configurations to SSH into servers using Putty."
llm_response = qa_chain(query)
process_llm_response(llm_response)