In [5]:
import os
import glob
import json
import time
import pandas as pd

from dotenv import load_dotenv

from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData

from langchain.schema import Document
from chromadb import PersistentClient
import chromadb
from chromadb.config import Settings
from langchain_huggingface import HuggingFaceEmbeddings


In [6]:

load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")


In [7]:
csv_folder = "legal_clause"

In [8]:
def load_csv_documents(csv_folder):
    documents = []
    for filepath in glob.glob(os.path.join(csv_folder, "*.csv")):
        df = pd.read_csv(filepath)
        file_name = os.path.basename(filepath)

        for idx, row in df.iterrows():
            content = "\n".join([f"{col}: {val}" for col, val in row.items()])
            metadata = {"source_file": file_name, "row_index": idx}
            documents.append(Document(page_content=content, metadata=metadata))

    print(f"✅ Loaded {len(documents)} documents from CSV files.")
    return documents

In [9]:
def save_documents(documents, folder="saved_docs"):
    os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, "documents.json"), "w", encoding="utf-8") as f:
        json.dump(
            [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents],
            f, indent=2
        )

    print(f"💾 Saved {len(documents)} documents to '{folder}/documents.json'")

In [10]:
def load_saved_documents(folder="saved_docs"):
    with open(os.path.join(folder, "documents.json"), "r", encoding="utf-8") as f:
        doc_data = json.load(f)

    documents = [Document(page_content=d["content"], metadata=d["metadata"]) for d in doc_data]
    print(f"✅ Loaded {len(documents)} documents from '{folder}/documents.json'")
    return documents

In [11]:
if not os.path.exists("saved_docs/documents.json"):
    documents = load_csv_documents(csv_folder)
    save_documents(documents)
else:
    documents = load_saved_documents()

✅ Loaded 150881 documents from 'saved_docs/documents.json'


In [12]:
engine = create_engine(DATABASE_URL)
metadata_db = MetaData()

In [13]:
documents_table = Table(
    'legal_docs_metadata',
    metadata_db,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('document_id', String, nullable=False),
    Column('chunk_text', String, nullable=False),
    Column('source', String, nullable=True),
    Column('row_index', Integer, nullable=True),
)

In [14]:
metadata_db.create_all(engine)

In [15]:

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

sample_vector = embedding_model.embed_query("Clause regarding employment period")
print(f"✅ Sample embedding vector size: {len(sample_vector)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Sample embedding vector size: 768


In [16]:
# Initialize ChromaDB (Persistent)
persist_dir = "legal_chroma_db"
collection_name = "legal_index"

chroma_client = PersistentClient(path="legal_chroma_db")

vectorstore = chroma_client.get_or_create_collection(name="legal_index")


if collection_name in [c.name for c in chroma_client.list_collections()]:
    vectorstore = chroma_client.get_collection(name=collection_name)
    print(f"✅ Loaded existing ChromaDB collection.")
else:
    vectorstore = chroma_client.create_collection(name=collection_name)
    print(f"✅ Created new ChromaDB collection.")

✅ Loaded existing ChromaDB collection.


In [17]:
BATCH_SIZE = 50
RETRY_LIMIT = 3
SLEEP_BETWEEN_RETRIES = 5  # seconds
CHECKPOINT_FILE = "embeddings_checkpoint.json"
VECTORSTORE_FOLDER = "chroma_vectorstore"

In [18]:
def save_checkpoint(processed_batches):
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump({"processed_batches": list(processed_batches)}, f)


def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            data = json.load(f)
            return set(data.get("processed_batches", []))  # Convert back to set
    return set()


# ✅ Batch Embedding and Storage with Metadata (using HuggingFaceEmbeddings)
def store_documents_in_batches(documents, source_file, embedding_model, vectorstore, chroma_client,
                               batch_size=BATCH_SIZE):
    total_docs = len(documents)
    processed_batches = load_checkpoint()
    successfully_stored_ids = []

    print(f"\n🚀 Starting embedding of {total_docs} documents in batches of {batch_size}...\n")

    for start_idx in range(0, total_docs, batch_size):
        batch_number = start_idx // batch_size

        if batch_number in processed_batches:
            print(f"⏩ Skipping already processed batch {batch_number}")
            continue

        batch_docs = documents[start_idx: start_idx + batch_size]
        batch_texts = [doc.page_content for doc in batch_docs]
        batch_metadatas = [doc.metadata for doc in batch_docs]
        batch_ids = [f"{source_file}_{start_idx + i}" for i in range(len(batch_docs))]

        success = False
        retry_count = 0

        while not success and retry_count < RETRY_LIMIT:
            try:
                start_time = time.time()

                # Embed documents
                batch_embeddings = embedding_model.embed_documents(batch_texts)

                # Store embeddings, metadata, and texts in vectorstore
                vectorstore.add(
                    ids=batch_ids,
                    embeddings=batch_embeddings,
                    documents=batch_texts,
                    metadatas=batch_metadatas
                )

                processed_batches.add(batch_number)
                save_checkpoint(processed_batches)

                successfully_stored_ids.extend(batch_ids)

                

                elapsed_time = round(time.time() - start_time, 2)
                print(
                    f"✅ Processed batch {batch_number} ({start_idx} to {start_idx + len(batch_docs) - 1}) in {elapsed_time} sec")

                success = True

            except Exception as e:
                retry_count += 1
                print(f"⚠️ Failed batch {batch_number} ({retry_count}/{RETRY_LIMIT} retries): {e}")
                time.sleep(SLEEP_BETWEEN_RETRIES)

        if not success:
            print(f"❌ Giving up on batch {batch_number} after {RETRY_LIMIT} retries.")

    chroma_client.persist()
    print("\n🎉 All documents embedded and stored successfully.")

    return {
        "processed_ids": successfully_stored_ids,
        "total_processed": len(successfully_stored_ids)
    }


result = store_documents_in_batches(
    documents=documents,
    source_file="legal_dataset",
    embedding_model=embedding_model,
    vectorstore=vectorstore,
    chroma_client=chroma_client
)

print(result["processed_ids"])  # List of successfully stored document IDs
print(result["total_processed"])  # Total count



🚀 Starting embedding of 150881 documents in batches of 50...

⏩ Skipping already processed batch 0
⏩ Skipping already processed batch 1
⏩ Skipping already processed batch 2
⏩ Skipping already processed batch 3
⏩ Skipping already processed batch 4
⏩ Skipping already processed batch 5
⏩ Skipping already processed batch 6
⏩ Skipping already processed batch 7
⏩ Skipping already processed batch 8
⏩ Skipping already processed batch 9
⏩ Skipping already processed batch 10
⏩ Skipping already processed batch 11
⏩ Skipping already processed batch 12
⏩ Skipping already processed batch 13
⏩ Skipping already processed batch 14
⏩ Skipping already processed batch 15
⏩ Skipping already processed batch 16
⏩ Skipping already processed batch 17
⏩ Skipping already processed batch 18
⏩ Skipping already processed batch 19
⏩ Skipping already processed batch 20
⏩ Skipping already processed batch 21
⏩ Skipping already processed batch 22
⏩ Skipping already processed batch 23
⏩ Skipping already processed batch 

AttributeError: 'Client' object has no attribute 'persist'

In [19]:
vector_collection = chroma_client.get_collection(name = "legal_index")
print(vector_collection)

Collection(name=legal_index)


In [20]:
from chromadb import Client
from chromadb.config import Settings

# Set the path where your Chroma DB is stored
persist_directory = "legal_chroma_db"

# Create a client connected to the persisted DB
client = Client(Settings(persist_directory=persist_directory, chroma_db_impl="duckdb+parquet"))

# List all collections
collections = client.list_collections()

for col in collections:
    print(col.name)


ValueError: [91mYou are using a deprecated configuration of Chroma.

[94mIf you do not have data you wish to migrate, you only need to change how you construct
your Chroma client. Please see the "New Clients" section of https://docs.trychroma.com/deployment/migration.
________________________________________________________________________________________________

If you do have data you wish to migrate, we have a migration tool you can use in order to
migrate your data to the new Chroma architecture.
Please `pip install chroma-migrate` and run `chroma-migrate` to migrate your data and then
change how you construct your Chroma client.

See https://docs.trychroma.com/deployment/migration for more information or join our discord at https://discord.gg/MMeYNTmh3x for help![0m

In [23]:
import chromadb
from chromadb.config import Settings

# Path to your persisted Chroma DB directory
PERSIST_DIR = "legal_chroma_db"

# Initialize Chroma client
client = chromadb.PersistentClient(path=PERSIST_DIR)

# List all collections
collections = client.list_collections()

# Print collection names
print("📚 Collections in ChromaDB:")
for col in collections:
    print(f"🔸 {col.name}")


📚 Collections in ChromaDB:
🔸 user_a9061205-42ca-4055-9abb-24996f3d30a3
🔸 legal_index
🔸 user_9999
