In [1]:
import os
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

# Initialize SentenceTransformer model for embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# ChromaDB setup
from chromadb.config import Settings
import chromadb

client = chromadb.Client(Settings(
    persist_directory="./chroma_db",  # Directory for persisting data
))

#client.delete_collection(name="mdd_documents")
collection = client.create_collection("mdd_documents")

# Load and preprocess documents
def preprocess_documents(directory):
    """
    Preprocesses text documents from a given directory and stores their embeddings.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    print(directory)
    for filename in os.listdir(directory):
        print(filename)
        if filename.endswith(".pdf"):
            print(f"Processing: {filename}")
            try:
                # Use PyPDF2 to extract text
                pdf_reader = PdfReader(os.path.join(directory, filename))
                content = ""
                for page in pdf_reader.pages:
                    content += page.extract_text()
                
                # Check if content was successfully extracted
                if not content.strip():
                    print(f"Warning: No text found in {filename}")
                    continue

                # Split text into smaller chunks
                chunks = text_splitter.split_text(content)
                
                # Embed each chunk
                embeddings = embedding_model.encode(chunks)
                # Add to ChromaDB
                for i, chunk in enumerate(chunks):
                    collection.add(
                        documents=[chunk],
                        metadatas=[{"source": filename}],
                        ids=[f"{filename}_{i}"],
                        embeddings=embedding_model.encode(chunk)
                    )
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    print("Documents have been processed and stored in ChromaDB!")

# Run preprocessing (adjust path to your document folder)
if __name__ == "__main__":
    preprocess_documents("./docs")

./docs
1460210.pdf
Processing: 1460210.pdf
6_irp681.pdf
Processing: 6_irp681.pdf
adatis-azure-national-archives.pdf
Processing: adatis-azure-national-archives.pdf
DSML.pdf
Processing: DSML.pdf
Machine - Learning - Tom Mitchell.pdf
Processing: Machine - Learning - Tom Mitchell.pdf
Machine Learning For Absolute Beginners.pdf
Processing: Machine Learning For Absolute Beginners.pdf
MACHINE LEARNING(R17A0534).pdf
Processing: MACHINE LEARNING(R17A0534).pdf
Paper_30-Machine_Learning_Algorithms.pdf
Processing: Paper_30-Machine_Learning_Algorithms.pdf
thebook.pdf
Processing: thebook.pdf
Documents have been processed and stored in ChromaDB!


In [2]:
collection.get(ids=['1460210.pdf_1'])

{'ids': ['1460210.pdf_1'],
 'embeddings': None,
 'documents': ['Many researchers try to collect and extract this information in large enough quantities that it requires machine automation. But because\npublications were historically intended for print and not machine consumption, the digital document formats used today (primarily PDF)\nhave created many hurdles for information extraction. Primarily, tools have relied on trying to convert PDF documents to plain text for'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': '1460210.pdf'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [5]:
chunks = [
    "DeepPDF: A Deep Learning Approach to Analyzing PDFs\nChristopher Stahl..."
]
embeddings = embedding_model.encode(chunks)
embeddings

array([[-7.50970319e-02,  1.17351096e-02, -1.14841014e-01,
         1.40069714e-02,  4.32890505e-02,  2.28892118e-02,
        -6.41679838e-02, -7.31650889e-02, -1.19264843e-02,
        -6.23673499e-02,  3.05247214e-02,  2.01456770e-02,
        -2.18213648e-02,  6.75240010e-02, -8.25160220e-02,
        -7.40731508e-02, -6.08922727e-03, -9.28646885e-03,
        -1.54298842e-02,  1.04545774e-02, -6.35114834e-02,
         9.47635993e-03,  5.42435683e-02, -6.09579980e-02,
         2.45148409e-02,  1.47313951e-02, -8.56740922e-02,
        -1.95289366e-02, -4.76937816e-02, -6.65004775e-02,
         1.32352754e-01,  4.07528691e-02, -2.03969260e-03,
         1.81031320e-02,  1.06654368e-01,  4.04617004e-02,
        -4.40376885e-02, -3.45940068e-02,  1.14347503e-01,
         1.52585702e-03,  2.62608845e-02, -7.81196579e-02,
         1.84115674e-02,  2.65086908e-02,  1.44533589e-02,
        -4.39289771e-02, -2.26655807e-02,  1.95012018e-02,
         4.86139730e-02,  6.80080876e-02, -1.99042354e-0

In [9]:
collection.add(
    documents=chunks,
    metadatas=[{"source": "1460210.pdf"}],
    ids=["1460210.pd_0"],
    embeddings=embeddings  # Ensure this parameter is not None
)

In [10]:
collection.get(ids=['1460210.pd_0'])

{'ids': ['1460210.pd_0'],
 'embeddings': None,
 'documents': ['DeepPDF: A Deep Learning Approach to Analyzing PDFs\nChristopher Stahl...'],
 'uris': None,
 'data': None,
 'metadatas': [{'source': '1460210.pdf'}],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [11]:
client = chromadb.Client(Settings(persist_directory="./chroma_db"))

In [12]:
from chromadb.config import Settings

client = chromadb.Client(Settings(
    persist_directory="./chroma_db"  # Ensure this path is correct
))

In [13]:
results = collection.get()

if results['embeddings']:
    print("Embeddings successfully stored!")
    print("Embeddings:", results['embeddings'])
else:
    print("No embeddings found. Check the document addition process.")

No embeddings found. Check the document addition process.


In [19]:
client.delete_collection(name="mdd_documents1")
collection = client.create_collection("mdd_documents1")

In [11]:
chunks = [
    "DeepPDF: A Deep Learning Approach to Analyzing PDFs\nChristopher Stahl..."
]
embeddings = embedding_model.encode(chunks)
print(embeddings)  # Should output a numerical vector for each chunk

[[-7.50970319e-02  1.17351096e-02 -1.14841014e-01  1.40069714e-02
   4.32890505e-02  2.28892118e-02 -6.41679838e-02 -7.31650889e-02
  -1.19264843e-02 -6.23673499e-02  3.05247214e-02  2.01456770e-02
  -2.18213648e-02  6.75240010e-02 -8.25160220e-02 -7.40731508e-02
  -6.08922727e-03 -9.28646885e-03 -1.54298842e-02  1.04545774e-02
  -6.35114834e-02  9.47635993e-03  5.42435683e-02 -6.09579980e-02
   2.45148409e-02  1.47313951e-02 -8.56740922e-02 -1.95289366e-02
  -4.76937816e-02 -6.65004775e-02  1.32352754e-01  4.07528691e-02
  -2.03969260e-03  1.81031320e-02  1.06654368e-01  4.04617004e-02
  -4.40376885e-02 -3.45940068e-02  1.14347503e-01  1.52585702e-03
   2.62608845e-02 -7.81196579e-02  1.84115674e-02  2.65086908e-02
   1.44533589e-02 -4.39289771e-02 -2.26655807e-02  1.95012018e-02
   4.86139730e-02  6.80080876e-02 -1.99042354e-02  4.41402234e-02
  -3.42644081e-02  3.85189541e-02  1.28648520e-01 -3.05237416e-02
   7.30262324e-02  4.50023450e-03 -6.15194887e-02 -6.66502938e-02
  -2.16215

In [14]:
results = collection.get(ids=["1460210.pdf_0"])
print("Embeddings:", results['embeddings'])

Embeddings: None


In [15]:
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
import chromadb

# Initialize ChromaDB client
client = chromadb.Client(Settings(persist_directory="./chroma_db"))
#client.delete_collection("mdd_documents")
collection = client.create_collection("mdd_documents")

# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample chunks
chunks = ["This is a sample text chunk.", "Another example text chunk."]

# Generate embeddings
embeddings = embedding_model.encode(chunks)
print("Generated Embeddings:", embeddings)

# Add data to ChromaDB
try:
    collection.add(
        documents=chunks,
        metadatas=[{"source": "example"}] * len(chunks),
        ids=[f"chunk_{i}" for i in range(len(chunks))],
        embeddings=embeddings
    )
    print("Data added successfully!")
except Exception as e:
    print("Error during collection.add:", e)

# Verify data storage
results = collection.get()
print("Stored Documents:", results['documents'])
print("Stored Embeddings:", results['embeddings'])

UniqueConstraintError: Collection mdd_documents already exists

In [16]:
import numpy as np

# Ensure embeddings are in the correct format
if isinstance(embeddings, np.ndarray):
    embeddings = embeddings.tolist()

print("Embeddings after conversion:", embeddings)

Embeddings after conversion: [[-0.07509703189134598, 0.011735109612345695, -0.11484101414680481, 0.014006971381604671, 0.043289050459861755, 0.022889211773872375, -0.06416798382997513, -0.07316508889198303, -0.011926484294235706, -0.06236734986305237, 0.0305247213691473, 0.020145677030086517, -0.021821364760398865, 0.0675240010023117, -0.08251602202653885, -0.07407315075397491, -0.006089227274060249, -0.009286468848586082, -0.015429884195327759, 0.010454577393829823, -0.06351148337125778, 0.00947635993361473, 0.05424356833100319, -0.06095799803733826, 0.02451484091579914, 0.014731395058333874, -0.08567409217357635, -0.019528936594724655, -0.047693781554698944, -0.0665004774928093, 0.13235275447368622, 0.040752869099378586, -0.002039692597463727, 0.018103132024407387, 0.10665436834096909, 0.0404617004096508, -0.04403768852353096, -0.03459400683641434, 0.11434750258922577, 0.0015258570201694965, 0.02626088447868824, -0.078119657933712, 0.01841156743466854, 0.026508690789341927, 0.0144533

In [17]:
print("Number of documents:", len(chunks))
print("Number of embeddings:", len(embeddings))

if len(chunks) != len(embeddings):
    print("Mismatch between documents and embeddings!")

Number of documents: 1
Number of embeddings: 1


In [18]:
try:
    collection.add(
        documents=chunks,
        metadatas=[{"source": "example"}] * len(chunks),
        ids=[f"chunk_{i}" for i in range(len(chunks))],
        embeddings=embeddings
    )
    print("Data added successfully!")
except Exception as e:
    print("Error during collection.add:", e)

Data added successfully!


In [21]:
collection.add(
    ids=['ddd'],
    documents=chunks,
    embeddings=embeddings
)

In [24]:
results = collection.get(ids=['ddd'])

In [25]:
results

{'ids': ['ddd'],
 'embeddings': None,
 'documents': ['DeepPDF: A Deep Learning Approach to Analyzing PDFs\nChristopher Stahl...'],
 'uris': None,
 'data': None,
 'metadatas': [None],
 'included': [<IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [3]:
import tempfile
from chromadb.config import Settings
import chromadb

# Create a temporary directory for ephemeral storage
temp_dir = tempfile.TemporaryDirectory()

# Initialize ChromaDB client
client = chromadb.Client (Settings(allow_reset=True))

# Use ChromaDB (add collections, documents, etc.)
collection = client.create_collection("mdd_documents")

# Clean up the temporary directory when done

# Add documents and embeddings
collection.add(
    documents=["Example document 1", "Example document 2"],
    metadatas=[{"source": "example"}] * 2,
    ids=["doc1", "doc2"],
    embeddings=[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
)

# Verify stored data
results = collection.get()
print("Documents:", results['documents'])
print("Embeddings:", results['embeddings'])

ValueError: An instance of Chroma already exists for ephemeral with different settings

In [2]:
client.reset()

ValueError: Resetting is not allowed by this configuration (to enable it, set `allow_reset` to `True` in your Settings() or include `ALLOW_RESET=TRUE` in your environment variables)