# Getting started with ChromaDB 

Before re-running this notebook, delete the contents of the `db` folder and restart the notebook kernel. Otherwise the documents are embedded and inserted again, which duplicates rows in the database.

## References

- [YouTube Course](https://youtu.be/8KrTO9bS91s?si=rEKPcDYKbav56GQj)
- [GitHub Repo](https://github.com/entbappy/Complete-Generative-AI-Course-on-YouTube/blob/main/Vector%20Database/1.Chroma_DB_demo.ipynb)

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [None]:
# Load the article corpus: every file matching "./*.txt" under the articles folder.
# NOTE: both paths are machine-specific absolute paths; the commented-out one is
# an alternative location (presumably a Lightning studio checkout — confirm).
# lightning_path = '/teamspace/studios/this_studio/woodshed/ai/notebooks/ChromaDB/data/articles'
lightning_path = '/Users/mpaz/github/aiforge/notebooks/ChromaDB/data/articles'
# DirectoryLoader is configured to open each matched file with TextLoader.
loader = DirectoryLoader(lightning_path, glob = "./*.txt", loader_cls= TextLoader)
# `document` holds the loaded documents and is consumed by the splitting cell.
document = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the loaded documents into chunks with chunk_size=1000 and
# chunk_overlap=200, so consecutive chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
# `text` is the list of chunked documents that gets embedded and indexed later.
text = text_splitter.split_documents(document)

In [None]:
# A notebook cell only displays the value of its last expression, so the
# chunk count has to be printed explicitly to be visible.
print(len(text))
# Preview the second chunk (shown as the cell's output).
text[1]

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Directory where Chroma keeps the collection on disk.
# NOTE: machine-specific absolute path — adjust for your checkout.
persist_directory = '/Users/mpaz/github/aiforge/db'

# Embedding model used to index the chunks here and passed again when the
# store is reopened.
embedding = OpenAIEmbeddings()

# Embed every chunk and index it. persist_directory must be passed at creation
# time: without it the store has no on-disk location and nothing is saved to
# the db folder.
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the db to disk, then drop the in-memory reference so the store has
# to be re-created before it can be used again.
vectordb.persist()
vectordb = None

In [None]:

# Now we can load the persisted database from disk, and use it as normal.
# persist_directory tells Chroma which on-disk store to reopen; without it a
# new, empty store would be created instead of loading the saved one.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [None]:

# Wrap the vector store in a retriever using its default settings.
retriever = vectordb.as_retriever()

# Fetch the chunks relevant to the question. invoke() is the supported
# entry point; get_relevant_documents() is deprecated in current LangChain.
docs = retriever.invoke(
    "How much money did Microsoft raise?"
)


In [None]:

# Number of documents the retriever returned for the query.
len(docs)

In [None]:
# Display the retrieved documents themselves.
docs

In [None]:

# Rebuild the retriever with search_kwargs {"k": 2}
# (presumably caps each query at the 2 best matches — confirm against the Chroma docs).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:

# Inspect the retriever's search_type attribute.
retriever.search_type


In [None]:

# Inspect the retriever's search_kwargs (set to {"k": 2} when it was built).
retriever.search_kwargs

# Make a chain

In [None]:
from langchain.chains import RetrievalQA

# Single LLM client, reused by the chain below instead of building a second one.
llm = OpenAI()

# create the chain to answer questions
# return_source_documents=True makes the response carry "source_documents",
# which process_llm_response reads to list the cited files.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:

## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each cited document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            "source_documents" entry (an iterable of objects whose
            ``metadata`` mapping has a 'source' key).
    """
    print(llm_response['result'])
    print('\n\nSources:')
    cited_docs = llm_response["source_documents"]
    # The generator is lazy, so each source is looked up right before it is printed.
    for origin in (doc.metadata['source'] for doc in cited_docs):
        print(origin)

In [None]:
# full example
query = "How much money did Microsoft raise?"
# invoke() replaces calling the chain object directly, which LangChain has deprecated.
llm_response = qa_chain.invoke(query)
process_llm_response(llm_response)

In [None]:
# second example: ask a different question through the same chain
query = "What is the news about Pando?"
# invoke() replaces calling the chain object directly, which LangChain has deprecated.
llm_response = qa_chain.invoke(query)
process_llm_response(llm_response)

In [1]:
from aiforge.config import config