In [31]:
import os
import dotenv

dotenv_path = dotenv.find_dotenv()
dotenv.load_dotenv(dotenv_path)

False

In [32]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

In [33]:
# this is required since Juypyter notebook is not async by default.
import nest_asyncio
nest_asyncio.apply()

## Load data

In [34]:
docs="/teamspace/studios/this_studio/retrieval-augmented-generation/data/"
loader = DirectoryLoader(docs, glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

## Split document into nodes

In [35]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [36]:
len(texts)

83

In [37]:
texts[3]

Document(page_content="get to the next round of funding Beyond just that product Market fit so product Market fit is just one step just one\nstep along the way so the the challenge is something we call the The Product Company Gap and I\nfigured I'd you know since I I actually started a company I figured I'd tell you about a company that actually failed to get across that Gap um and that might be\nuseful I started a company called padient um it was a mobile payments company QR code payments before it was\nhappening in China and Korea like we actually had to write we had to code our own QR Code Reader we hired a satellite\nimagery engineer to build our own QR Reader from scratch the idea was hey we\ncould use QR codes instead of credit cards to pay for stuff um we knew like we had we started\ncompanies before and we knew that it was going to be hard to build a company a direct a consumer company how are you", metadata={'source': '/teamspace/studios/this_studio/retrieval-augmented-generat

In [39]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db1'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [40]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [41]:
# Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

In [42]:
retriever = vectordb.as_retriever()

In [43]:
# docs = retriever.get_relevant_documents("What is Product Company Gap?")
docs = retriever.invoke("What is Product Company Gap?")


In [44]:
len(docs)

4

In [45]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [46]:
## Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [47]:
# full example
query = "What is Product Company Gap?"
llm_response = qa_chain.invoke(query)
process_llm_response(llm_response)

 The Product Company Gap is a term used to describe the challenges and obstacles that a company must overcome in order to successfully bring a product to market and achieve significant growth and success. This includes considerations such as product-market fit, go-to-market strategies, pricing, and overall business model. 


Sources:
/teamspace/studios/this_studio/retrieval-augmented-generation/data/part1.txt
/teamspace/studios/this_studio/retrieval-augmented-generation/data/part1.txt
/teamspace/studios/this_studio/retrieval-augmented-generation/data/part1.txt
/teamspace/studios/this_studio/retrieval-augmented-generation/data/part2.txt


# Caution

On lightning.ai, deleting the database seems to cause an issue.

In [None]:
# To cleanup, you can delete the collection

# vectordb.delete_collection()
# vectordb.persist()
# delete the directory
# !rm -rf db/