# PDFs

In [4]:
from langchain_community.document_loaders import PyPDFDirectoryLoader


def load_documents(data_path: str = "data"):
    """Load the PDFs found in a directory as LangChain documents.

    Args:
        data_path: Directory passed to ``PyPDFDirectoryLoader``. Defaults to
            ``"data"``, resolved relative to the working directory.

    Returns:
        The result of ``PyPDFDirectoryLoader.load()``. The notebook output
        shows ``Document`` objects whose metadata carries ``source`` and
        ``page`` keys.
    """
    return PyPDFDirectoryLoader(path=data_path).load()


documents = load_documents()

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Upper bound on chunk length, measured with ``len``.
            Defaults to 800.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            the splitter. Defaults to 80.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


# Split the loaded PDF documents using the default size/overlap.
chunks = split_documents(documents)

In [6]:
chunks

[Document(metadata={'source': 'data/monopoly.pdf', 'page': 0}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player'),
 Document(metadata={'source': 'data/monopoly.pdf', 'page': 0}, page_content="1. When starting the game

In [7]:
def calculate_chunk_ids(chunks):
    """Store a ``"<source>:<page>:<index>"`` ID in each chunk's ``metadata["id"]``.

    ``<index>`` counts the chunks seen so far for the same source/page pair,
    starting at 0. A counter is kept per page, so chunks of one page receive
    distinct IDs even when they are not adjacent in ``chunks``.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict; the
            ``source`` and ``page`` keys are read with ``.get`` (missing keys
            are rendered as ``None`` in the ID).

    Returns:
        The same ``chunks`` object; its elements are modified in place.
    """
    # Next free index for every "<source>:<page>" seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add the ID to the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

# calculate_chunk_ids writes the IDs into each chunk's metadata in place and
# returns the list it was given, so `chunks_with_ids` is the same object as `chunks`.
chunks_with_ids = calculate_chunk_ids(chunks)


In [8]:
chunks_with_ids

[Document(metadata={'source': 'data/monopoly.pdf', 'page': 0, 'id': 'data/monopoly.pdf:0:0'}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player'),
 Document(metadata={'source': 'data/monopoly.pdf', 'page': 0, 'id': 'da

In [9]:
import chromadb

# Initialize a persistent client; the store lives in the 'chromadb' directory
# given by `path` below (a relative path).
persistent_client = chromadb.PersistentClient(path="chromadb")

# Create the collection on first use, or reuse it if it already exists
collection = persistent_client.get_or_create_collection(
    name="my_documents",  # Collection that receives the PDF chunks
    # metadata={"hnsw:space": "cosine"}  # Optional: configure the similarity metric (cosine, L2, etc.)
)

In [10]:
# Ask the collection for its stored records with no extra fields requested;
# only the "ids" entry of the result is used.
existing_items = collection.get(include=[])
existing_ids = set(existing_items["ids"])

# Keep only the chunks whose ID is not in the collection yet, so re-running
# the notebook does not try to add the same chunk twice.
new_chunks = [chunk for chunk in chunks_with_ids 
    if chunk.metadata["id"] not in existing_ids]

In [11]:
if new_chunks:
    # Use dedicated names for the payload lists: rebinding `documents` here
    # would replace the loaded PDF documents with plain strings and break a
    # re-run of the splitting cell, which calls split_documents(documents).
    new_texts = [chunk.page_content for chunk in new_chunks]
    new_metadatas = [chunk.metadata for chunk in new_chunks]
    new_ids = [chunk.metadata["id"] for chunk in new_chunks]

    # Add only the new chunks. No embeddings are passed in; precomputed
    # vectors could be supplied here if available.
    collection.add(
        documents=new_texts,
        ids=new_ids,
        metadatas=new_metadatas,
    )
    print(f"✅ Added {len(new_texts)} new documents")
else:
    print("✅ No new documents to add")

✅ Added 23 new documents


In [12]:
# Preview a few stored records (ids, embeddings, documents) to confirm the add worked.
collection.peek() 

{'ids': ['data/monopoly.pdf:0:0',
  'data/monopoly.pdf:0:1',
  'data/monopoly.pdf:1:0',
  'data/monopoly.pdf:1:1',
  'data/monopoly.pdf:1:2',
  'data/monopoly.pdf:2:0',
  'data/monopoly.pdf:2:1',
  'data/monopoly.pdf:2:2',
  'data/monopoly.pdf:3:0',
  'data/monopoly.pdf:3:1'],
 'embeddings': array([[ 0.03377642, -0.00334917, -0.04907041, ..., -0.02051336,
         -0.00862594, -0.02566011],
        [ 0.03092418,  0.03485823, -0.05984937, ..., -0.08782715,
         -0.03168297, -0.02584845],
        [ 0.00728211, -0.05288469, -0.01506493, ...,  0.00914375,
         -0.01202721, -0.01705527],
        ...,
        [-0.00777433, -0.0660709 , -0.00465609, ..., -0.03077541,
         -0.01948163, -0.01236132],
        [ 0.05480104,  0.03123317, -0.04032138, ..., -0.08955467,
         -0.06360427, -0.04936159],
        [-0.00829957,  0.05528712,  0.02429235, ..., -0.0560386 ,
         -0.04950954,  0.00133125]]),
 'documents': ['MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ 

In [13]:
# Smoke-test retrieval against the PDF collection with a single question.
results = collection.query(
    query_texts=["what happens in the start of monopoly ?"],  # Test query
    n_results=2  # Number of results to return
)
# The printed dict holds parallel lists (ids, documents, ...) per query text.
print(results)

{'ids': [['data/monopoly.pdf:1:0', 'data/monopoly.pdf:7:2']], 'embeddings': None, 'documents': [['Bus: This lets you "get off the bus early." Look at the two white \ndice. You can move the value of one die, the other die, or the \nsum of both dice. So if you rolled a 1 and a 5, you can move \n1 space, 5 spaces, or 6 spaces: \\t\'s your choice. \nMr. Monopoly: First, move the sum of the two white dice \nand resolve the space you land on (such as drawing a card, \nbuying the property, paying rent, etc.). Then, one of two \nthings will happen depending on whether or not there is still \nproperty in the bank. \nYES, there is property in the bank -Advance to the NEXT \nproperty that the bank still holds and buy it if you wish. If you \ndon\'t want to buy this property, move to the space anyway \nand put the property up for auction. \nNO, there are no more properties in the bank - Advance to the', 'lend money to another player. \nL \nWe will be happy to hear your questions or comments about 

# CSV

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(
    file_path="ICD.csv",
)
data = loader.load()

In [2]:
data

[Document(metadata={'source': 'ICD.csv', 'row': 0}, page_content='Code: A00.-\nDescription: Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 1}, page_content='Code: A00.0\nDescription: Cholera durch Vibrio cholerae O:1, Biovar cholerae Klassische Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 2}, page_content='Code: A00.1\nDescription: Cholera durch Vibrio cholerae O:1, Biovar eltor El-Tor-Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 3}, page_content='Code: A00.9\nDescription: Cholera, nicht näher bezeichnet 11 ICD-10-GM Version 2025'),
 Document(metadata={'source': 'ICD.csv', 'row': 4}, page_content='Code: A01.-\nDescription: Typhus abdominalis und Paratyphus'),
 Document(metadata={'source': 'ICD.csv', 'row': 5}, page_content='Code: A01.0\nDescription: Typhus abdominalis Infektion durch Salmonella typhi Typhoides Fieber'),
 Document(metadata={'source': 'ICD.csv', 'row': 6}, page_content='Code: A01.1\nDescription: Paratyphus A'),
 Document(metadata={

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
def split_documents(
    documents: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Upper bound on chunk length, measured with ``len``.
            Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            the splitter. Defaults to 100.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


# Split the CSV rows using the default size/overlap.
csv_chunks = split_documents(data)

In [4]:
csv_chunks

[Document(metadata={'source': 'ICD.csv', 'row': 0}, page_content='Code: A00.-\nDescription: Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 1}, page_content='Code: A00.0\nDescription: Cholera durch Vibrio cholerae O:1, Biovar cholerae Klassische Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 2}, page_content='Code: A00.1\nDescription: Cholera durch Vibrio cholerae O:1, Biovar eltor El-Tor-Cholera'),
 Document(metadata={'source': 'ICD.csv', 'row': 3}, page_content='Code: A00.9\nDescription: Cholera, nicht näher bezeichnet 11 ICD-10-GM Version 2025'),
 Document(metadata={'source': 'ICD.csv', 'row': 4}, page_content='Code: A01.-\nDescription: Typhus abdominalis und Paratyphus'),
 Document(metadata={'source': 'ICD.csv', 'row': 5}, page_content='Code: A01.0\nDescription: Typhus abdominalis Infektion durch Salmonella typhi Typhoides Fieber'),
 Document(metadata={'source': 'ICD.csv', 'row': 6}, page_content='Code: A01.1\nDescription: Paratyphus A'),
 Document(metadata={

In [5]:
import chromadb

# Initialize a persistent client; the store lives in the 'chromadb' directory
# given by `path` below (a relative path).
persistent_client = chromadb.PersistentClient(path="chromadb")

# Create the collection on first use, or reuse it if it already exists
collection = persistent_client.get_or_create_collection(
    name="icd_codes",  # Collection that receives the ICD CSV chunks
    # metadata={"hnsw:space": "cosine"}  # Optional: configure the similarity metric (cosine, L2, etc.)
)

In [6]:
# Debug: print the metadata of the first three chunks to see which keys are
# available for building IDs.
for i, chunk in enumerate(csv_chunks[:3]):
    print(f"Chunk {i} metadata: {chunk.metadata}")

Chunk 0 metadata: {'source': 'ICD.csv', 'row': 0}
Chunk 1 metadata: {'source': 'ICD.csv', 'row': 1}
Chunk 2 metadata: {'source': 'ICD.csv', 'row': 2}


In [7]:
# Add IDs to CSV chunks with unique identifiers
def calculate_csv_chunk_ids(chunks):
    """Store a ``"<source>:row_<row>_<n>"`` ID in each chunk's ``metadata["id"]``.

    ``<n>`` is 0 for the first chunk seen for a given source/row pair and
    grows by one for each further chunk of that pair. When a chunk has no
    ``row`` entry, its position in ``chunks`` is used as the row value.

    Returns the same ``chunks`` object; its elements are modified in place.
    """
    # Highest counter handed out so far, keyed by "<source>:row_<row>".
    last_counter = {}

    for position, chunk in enumerate(chunks):
        meta = chunk.metadata
        source = meta.get("source")
        row = meta.get("row", position)

        prefix = f"{source}:row_{row}"
        # -1 as the "never seen" value makes the first occurrence come out as 0.
        counter = last_counter.get(prefix, -1) + 1
        last_counter[prefix] = counter

        meta["id"] = f"{prefix}_{counter}"

    return chunks

# Attach IDs to the CSV chunks (they are updated in place and the same list
# comes back).
csv_chunks_with_ids = calculate_csv_chunk_ids(csv_chunks)

# Show the first three chunks to confirm the "id" key is now present.
for i, chunk in enumerate(csv_chunks_with_ids[:3]):
    print(f"Chunk {i} metadata after adding IDs: {chunk.metadata}")

# Sanity check: compare the number of IDs with the number of distinct IDs.
ids = [chunk.metadata["id"] for chunk in csv_chunks_with_ids]
unique_ids = set(ids)
print(f"Total IDs: {len(ids)}, Unique IDs: {len(unique_ids)}")
if len(ids) != len(unique_ids):
    print(f"❌ Found {len(ids) - len(unique_ids)} duplicate IDs")
else:
    print("✅ All IDs are unique!")

Chunk 0 metadata after adding IDs: {'source': 'ICD.csv', 'row': 0, 'id': 'ICD.csv:row_0_0'}
Chunk 1 metadata after adding IDs: {'source': 'ICD.csv', 'row': 1, 'id': 'ICD.csv:row_1_0'}
Chunk 2 metadata after adding IDs: {'source': 'ICD.csv', 'row': 2, 'id': 'ICD.csv:row_2_0'}
Total IDs: 10711, Unique IDs: 10711
✅ All IDs are unique!


In [None]:
# Only insert chunks whose IDs are not stored yet, so an interrupted or
# repeated run resumes where it stopped instead of re-submitting every chunk.
stored_ids = set(collection.get(include=[])["ids"])
pending_chunks = [
    chunk for chunk in csv_chunks_with_ids
    if chunk.metadata["id"] not in stored_ids
]

if pending_chunks:
    # Prepare the payload lists for the pending chunks
    documents = [chunk.page_content for chunk in pending_chunks]
    metadatas = [chunk.metadata for chunk in pending_chunks]
    ids = [chunk.metadata["id"] for chunk in pending_chunks]

    # Process in smaller batches
    batch_size = 100
    total_docs = len(documents)
    # Ceiling division; computed once because it does not change in the loop.
    total_batches = (total_docs + batch_size - 1) // batch_size

    for i in range(0, total_docs, batch_size):
        end_idx = min(i + batch_size, total_docs)
        print(f"Processing batch {i//batch_size + 1}/{total_batches}: documents {i} to {end_idx-1}")

        # Add current batch
        collection.add(
            documents=documents[i:end_idx],
            ids=ids[i:end_idx],
            metadatas=metadatas[i:end_idx]
        )

    print(f"✅ Added {total_docs} documents in batches")
else:
    print("✅ No new documents to add")

Processing batch 1/108: documents 0 to 99


In [None]:
# Preview a few stored records from the ICD collection to confirm the add worked.
collection.peek() 

In [None]:
# Smoke-test retrieval against the ICD collection with a single search phrase.
results = collection.query(
    query_texts=["cholera topica"],  # Test query
    n_results=2  # Number of results to return
)
print(results)