In [1]:
## building a sample vector db
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
# Read speech.txt into LangChain Document objects.
# The text contains non-ASCII characters (e.g. "ä", "–"), so the encoding is
# pinned to UTF-8 rather than left to the platform default, which can raise a
# decode error or garble characters on systems whose locale is not UTF-8.
loader = TextLoader("speech.txt", encoding="utf-8")
data = loader.load()

data

[Document(metadata={'source': 'speech.txt'}, page_content='Heisenberg enjoyed classical music and was an accomplished pianist, and playing for others was a large part of his social life.[3] During the late 1920s and early 1930s he would often play music and dance at the Berlin home of his aristocratic student Carl Friedrich von Weizsäcker, during which time he carried on a courtship with Carl\'s high-school-age sister Adelheid, which scandalized her parents and led to him being unwelcome at their home for a time.[26] Years later his interest in music also led to meeting his future wife. In January 1937, Heisenberg met Elisabeth Schumacher (1914–1998) at a private music recital. Elisabeth was the daughter of a well-known Berlin economics professor, and her brother was the economist E. F. Schumacher, author of Small Is Beautiful. Heisenberg married her on 29 April. Fraternal twins Maria and Wolfgang were born in January 1938, whereupon Wolfgang Pauli congratulated Heisenberg on his "pair

In [3]:
# Split the loaded documents into chunks capped at 200 characters, with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=200,
)
splits = text_splitter.split_documents(data)

In [4]:
# Embed each chunk via the local Ollama server and index the vectors in a
# Chroma store (no persist_directory is passed here).
# NOTE(review): "gemma:2b" looks like a general-purpose LLM rather than a
# dedicated embedding model — confirm it is the intended choice for retrieval.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="gemma:2b",
)
vectordb = Chroma.from_documents(embedding=embeddings, documents=splits)
vectordb

<langchain_chroma.vectorstores.Chroma at 0x10f052660>

In [5]:
# Run a similarity search against the vector store and display the text of
# the first returned chunk.
query = "What is the main idea of the speech?"
docs = vectordb.similarity_search(query=query)

docs[0].page_content

'Sauter, John C. Slater, Edward Teller, John Hasbrouck van Vleck, Victor Frederick Weisskopf, Carl Friedrich von Weizsäcker, Gregor Wentzel, and Clarence Zener.[42]'

In [7]:
# Persist the index to disk under ./chroma_db.
# Chunk ids are deterministic so that re-running this cell (or the whole
# notebook) upserts the same records instead of appending another copy of
# every chunk under fresh random ids, which would fill top-k search results
# with duplicates.
# NOTE: ids are positional; if the chunking parameters change, delete
# ./chroma_db before re-running so stale chunks do not linger.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=[f"speech-{i}" for i in range(len(splits))],
    persist_directory="./chroma_db",
)



In [8]:
# Reopen the collection stored in ./chroma_db and repeat the same search
# against the on-disk index.
db2 = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)
docs = db2.similarity_search(query=query)
print(docs[0].page_content)

Sauter, John C. Slater, Edward Teller, John Hasbrouck van Vleck, Victor Frederick Weisskopf, Carl Friedrich von Weizsäcker, Gregor Wentzel, and Clarence Zener.[42]


In [9]:
# Retriever option: wrap the vector store in a retriever and show the text of
# the first document it returns for the question.
retriever = vectordb.as_retriever()
retriever.invoke(input="What is the main idea of the speech?")[0].page_content

'Sauter, John C. Slater, Edward Teller, John Hasbrouck van Vleck, Victor Frederick Weisskopf, Carl Friedrich von Weizsäcker, Gregor Wentzel, and Clarence Zener.[42]'