# Building LLM applications: Notebook 07

# Vector DBs

## Initialize

In [1]:
import os
import dotenv
import random

from pathlib import Path

from langchain_core.documents import Document
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_chroma import Chroma


In [2]:
# Ollama model names. EMBEDING_MODEL is passed to OllamaEmbeddings below;
# MODEL is presumably the chat model for ChatOllama (not used in this part).
# NOTE(review): "EMBEDING" is misspelled, but the name is referenced by other
# cells, so rename it everywhere at once or leave it as is.
MODEL = 'llama3.2:3b-instruct-fp16'
EMBEDING_MODEL = 'nomic-embed-text'

# On-disk layout, relative to the current working directory.
DATA_DIR = Path('.') / 'data'
CHROMA_PATH = DATA_DIR / 'chroma'
DOCS_DIR = DATA_DIR / 'books'

# Create the Chroma directory (and missing parents); no error if it exists.
CHROMA_PATH.mkdir(parents=True, exist_ok=True)

# Cache of the document ids in the vector DB; filled lazily by get_db_ids().
_db_ids = None

In [3]:
# Read from the `.env` file
dotenv.load_dotenv()

# Ollama server URL from the environment; None when the variable is not set,
# in which case the message below reports 'local'.
OLLAMA_URL = os.getenv('OLLAMA_URL')
print(f"Using Ollama server: {OLLAMA_URL if OLLAMA_URL else 'local'}")

Using Ollama server: local


## Vector DB

In [4]:
# Embedding function backed by the Ollama embedding model; base_url is None
# when OLLAMA_URL is unset.
embedding = OllamaEmbeddings(model=EMBEDING_MODEL, base_url=OLLAMA_URL)
# Chroma vector store persisted under CHROMA_PATH, using the embedding above.
vdb = Chroma(persist_directory=str(CHROMA_PATH), embedding_function=embedding)

### Add documents

In [5]:
def get_db_ids(force=False):
    """
    Return the set of document ids stored in the vector database.

    The ids are fetched from `vdb` on the first call and cached in the
    module-level `_db_ids`; later calls return the cached set unless
    `force` is true, in which case the ids are fetched again.
    """
    global _db_ids
    needs_refresh = force or _db_ids is None
    if needs_refresh:
        # include=[] requests no extra fields; only the 'ids' entry is read
        _db_ids = set(vdb.get(include=[])['ids'])
    return _db_ids

In [6]:
def add_paragraph_to_vdb(file: Path, paragraph_num: int, paragraph: str):
    """
    Add a paragraph to the database.

    The paragraph is stored under the id "<file stem>-<paragraph_num>" with
    metadata 'source' (the file stem) and 'paragraph' (the paragraph number).
    Paragraphs that are empty or have fewer than 3 lines after stripping, and
    paragraphs whose id is already known, are skipped.

    Returns 1 if the paragraph was added, 0 otherwise
    """
    # Ignore empty or short paragraphs (fewer than 3 lines)
    stripped = paragraph.strip()
    if not stripped or len(stripped.split('\n')) < 3:
        return 0
    # Skip already added ids
    doc_id = f"{file.stem}-{paragraph_num}"
    known_ids = get_db_ids()
    if doc_id in known_ids:
        return 0
    # Add to database
    metadata = {'source': str(file.stem), 'paragraph': paragraph_num}
    doc = Document(page_content=paragraph, metadata=metadata)
    vdb.add_documents([doc], ids=[doc_id])
    # Keep the cached id set in sync with the database, so later lookups see
    # this id without needing get_db_ids(force=True)
    known_ids.add(doc_id)
    return 1

In [7]:
def add_book_to_db(file: Path):
    """
    Add a book to the database.

    The file is read as UTF-8 text and split into paragraphs on empty lines;
    each paragraph is passed to add_paragraph_to_vdb together with its index.
    Progress is printed as one dot per paragraph, with the running index
    printed every 100 paragraphs.

    Returns the number of paragraphs added
    """
    # Explicit encoding: without it the platform's locale encoding is used,
    # which is not UTF-8 everywhere and can fail or garble the text
    text = file.read_text(encoding='utf-8')
    # Split by paragraphs (empty lines)
    paragraphs = text.split('\n\n')
    print(f"\nReading file '{file}, found {len(paragraphs)} paragraphs")
    count_added = 0
    for i, p in enumerate(paragraphs):
        count_added += add_paragraph_to_vdb(file, i, p)
        # Start a new progress row, labelled with the index, every 100 items
        if i % 100 == 0:
            print(f"\n{i}\t:", end='', flush=True)
        print('.', end='', flush=True)
    return count_added

In [8]:
# Add all books to the database
# Every `*.txt` file directly under DOCS_DIR is ingested; the per-book count
# returned by add_book_to_db is not used here.
for txt_file in DOCS_DIR.glob('*.txt'):
    add_book_to_db(txt_file)


Reading file 'data/books/dracula_chapter_1.txt, found 44 paragraphs

0	:............................................
Reading file 'data/books/hhgttg_chunk_1.txt, found 380 paragraphs

0	:....................................................................................................
100	:....................................................................................................
200	:....................................................................................................
300	:................................................................................