for pdfs

In [3]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
def load_documents():
    document_loader = PyPDFDirectoryLoader(path = "data")
    return document_loader.load()

In [4]:
documents = load_documents()

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [6]:
chunks = split_documents(documents)

In [7]:
chunks

[Document(metadata={'source': 'data/monopoly.pdf', 'page': 0}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player'),
 Document(metadata={'source': 'data/monopoly.pdf', 'page': 0}, page_content="1. When starting the game

In [8]:
def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks


In [9]:
chunks_with_ids = calculate_chunk_ids(chunks)
chunks_with_ids

[Document(metadata={'source': 'data/monopoly.pdf', 'page': 0, 'id': 'data/monopoly.pdf:0:0'}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player'),
 Document(metadata={'source': 'data/monopoly.pdf', 'page': 0, 'id': 'da

In [14]:
import chromadb

persistent_client = chromadb.PersistentClient(path="chromadb")
collections = persistent_client.list_collections()
print(f"Available collections: {[c.name for c in collections]}")

Available collections: []


In [25]:
# Check if collections exist
if collections:
    collection = persistent_client.get_collection(name=collections[0].name)
    
    # Get existing IDs
    existing_items = collection.get(include=[])
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")
    
    # Filter out chunks that already exist
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
    
    if len(new_chunks):
        print(f"üëâ Adding new documents: {len(new_chunks)}")
        
        # Prepare data in format expected by ChromaDB
        documents = [chunk.page_content for chunk in new_chunks]
        metadatas = [chunk.metadata for chunk in new_chunks]
        ids = [chunk.metadata["id"] for chunk in new_chunks]
        
        # Add to collection
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print("‚úÖ Documents added successfully")
    else:
        print("‚úÖ No new documents to add")
else:
    # Create a new collection
    print("No collections found. Creating a new collection...")
    collection = persistent_client.create_collection(name="documents")
    
    # All chunks are new since this is a new collection
    print(f"üëâ Adding {len(chunks_with_ids)} documents to new collection")
    
    # Prepare data in format expected by ChromaDB
    documents = [chunk.page_content for chunk in chunks_with_ids]
    metadatas = [chunk.metadata for chunk in chunks_with_ids]
    ids = [chunk.metadata["id"] for chunk in chunks_with_ids]
    
    # Add to collection
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    print("‚úÖ New collection created and documents added successfully")

No collections found. Creating a new collection...
üëâ Adding 23 documents to new collection
‚úÖ New collection created and documents added successfully
‚úÖ New collection created and documents added successfully


In [30]:
collection.peek() 

{'ids': ['data/monopoly.pdf:0:0',
  'data/monopoly.pdf:0:1',
  'data/monopoly.pdf:1:0',
  'data/monopoly.pdf:1:1',
  'data/monopoly.pdf:1:2',
  'data/monopoly.pdf:2:0',
  'data/monopoly.pdf:2:1',
  'data/monopoly.pdf:2:2',
  'data/monopoly.pdf:3:0',
  'data/monopoly.pdf:3:1'],
 'embeddings': array([[ 0.03377642, -0.00334917, -0.04907041, ..., -0.02051336,
         -0.00862594, -0.02566011],
        [ 0.03092418,  0.03485823, -0.05984937, ..., -0.08782715,
         -0.03168297, -0.02584845],
        [ 0.00728211, -0.05288469, -0.01506493, ...,  0.00914375,
         -0.01202721, -0.01705527],
        ...,
        [-0.00777433, -0.0660709 , -0.00465609, ..., -0.03077541,
         -0.01948163, -0.01236132],
        [ 0.05480104,  0.03123317, -0.04032138, ..., -0.08955467,
         -0.06360427, -0.04936159],
        [-0.00829957,  0.05528712,  0.02429235, ..., -0.0560386 ,
         -0.04950954,  0.00133125]]),
 'documents': ['MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ 

In [None]:
from langchain_community.vectorstores import Chroma

# Create a persistent database
vector_store = Chroma.from_texts(
    texts=chunk_texts,
    embedding=embeddings_model,
    metadatas=[chunk.metadata for chunk in chunks],  # Includes source, page, etc.
    ids=[generate_chunk_id(...) for chunk in chunks],  # Unique IDs
    persist_directory="./chroma_db"  # Save to disk
)

# CSV

In [36]:
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(
    file_path="ICD.csv",
)
data = loader.load()

In [37]:
data

[Document(metadata={'source': 'ICD.csv', 'row': 0}, page_content='Code: A00.-\nDescription: Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 1}, page_content='Code: A00.0\nDescription: Cholera durch Vibrio cholerae O:1, Biovar cholerae Klassische Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 2}, page_content='Code: A00.1\nDescription: Cholera durch Vibrio cholerae O:1, Biovar eltor El-Tor-Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 3}, page_content='Code: A00.9\nDescription: Cholera, nicht n√§her bezeichnet 11 ICD-10-GM Version 2025'),
 Document(metadata={'source': 'ICD.csv', 'row': 4}, page_content='Code: A01.-\nDescription: Typhus abdominalis und Paratyphus'),
 Document(metadata={'source': 'ICD.csv', 'row': 5}, page_content='Code: A01.0\nDescription: Typhus abdominalis Infektion durch Salmonella typhi Typhoides Fieber'),
 Document(metadata={'source': 'ICD.csv', 'row': 6}, page_content='Code: A01.1\nDescription: Paratyphus A'),
 Document(metadata=

In [None]:
import chromadb

persistent_client = chromadb.PersistentClient(path="chromadb")