In [8]:
import chromadb
# Instantiate a Chroma client with default arguments; the collection in the
# next cell is obtained from this client.
# NOTE(review): presumably an in-process, non-persistent client — confirm
# against the installed chromadb version.
chroma_client = chromadb.Client()

In [2]:
# Fetch "my_collection" from chroma_client, creating it on first use.
# get_or_create_collection keeps this cell re-runnable: a plain
# create_collection call raises once the name already exists on the client.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [3]:
# Seed the collection with two short fruit-themed documents; the ids list
# lines up positionally with the documents list (id1 -> pineapple,
# id2 -> oranges).
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
    ],
    ids=["id1", "id2"],
)

In [5]:
# Ask for the two stored documents nearest to a Hawaii-themed query.
# The query is supplied as raw text (Chroma embeds it for you) and
# n_results caps how many matches come back.
results = collection.query(
    n_results=2,
    query_texts=["This is a query document about hawaii"],
)
print(results)

{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None]], 'distances': [[1.0404008626937866, 1.2430801391601562]]}


In [36]:
# Second client: talks to a Chroma server at localhost:8000 over HTTP.
# NOTE(review): port is passed as the string "8000"; recent chromadb releases
# annotate this parameter as int — confirm which form the installed version
# expects.
chroma_http_client = chromadb.HttpClient(host='localhost', port="8000")

In [40]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [45]:
from pathlib import Path
from langchain.schema import Document

# Read every *.txt file under ./products_helioforge in sorted path order and
# wrap each one in a Document whose metadata records the file it came from.
# Undecodable bytes are replaced rather than raising (errors='replace').
doc_dir = Path('products_helioforge')
documents = [
    Document(
        page_content=txt_path.read_text(encoding='utf-8', errors='replace'),
        metadata={'source': str(txt_path)},
    )
    for txt_path in sorted(doc_dir.glob('*.txt'))
]

# Split the documents using chunk_size=300 and chunk_overlap=100, with
# lengths measured by len() (i.e. in characters).
splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=300,
)
chunks = splitter.split_documents(documents)
chunks

[Document(metadata={'source': 'products_helioforge/helioforge_atlas.txt'}, page_content='HelioForge Atlas — Unified Energy Management Platform'),
 Document(metadata={'source': 'products_helioforge/helioforge_atlas.txt'}, page_content='Company background'),
 Document(metadata={'source': 'products_helioforge/helioforge_atlas.txt'}, page_content='HelioForge is a forward-thinking energy-technology company that focuses on integrated, resilient energy solutions for mid-sized commercial operations. The company’s mission is to simplify energy intelligence: making clean generation, storage, monitoring, and optimization work together as one'),
 Document(metadata={'source': 'products_helioforge/helioforge_atlas.txt'}, page_content='intelligence: making clean generation, storage, monitoring, and optimization work together as one cohesive system. The HelioForge product family is built around three principles: observability, autonomy, and adaptability.'),
 Document(metadata={'source': 'products_heli

In [37]:
# Show which collections are visible through the HTTP client.
chroma_http_client.list_collections()

[Collection(name=my_collection)]

In [38]:
# Rebinds `collection`: from this cell onward the name refers to the
# 'my_collection' handle fetched through chroma_http_client, no longer the
# one obtained from chroma_client near the top of the notebook.
collection = chroma_http_client.get_collection('my_collection')

In [39]:
# Ask the collection how many records it currently holds.
collection.count()

258

In [10]:
# Fetch the record stored under id 'id1' from whichever collection
# `collection` is currently bound to.
collection.get('id1')

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['hello world'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

In [2]:
import csv

In [22]:
import re

# Turn each row of T12.csv into one text record for the collection:
#   documents -> the row re-joined with commas after number clean-up
#   metadatas -> 1-based row number plus the row's raw first cell
#   ids       -> the 1-based row number as a string
documents, metadatas, ids = [], [], []

# Whole-cell match for comma-grouped numbers such as 1,234 or -1,234.56
thousands_pattern = re.compile(r'^-?\d{1,3}(?:,\d{3})+(?:\.\d+)?$')
# Whole-cell match for parenthesised (accounting-style) numbers such as (1,234.56)
paren_thousands_pattern = re.compile(r'^\(\d{1,3}(?:,\d{3})+(?:\.\d+)?\)$')

with open('T12.csv', 'r', encoding='utf-8', errors='replace', newline='') as t12:
    reader = csv.reader(t12)
    for row_number, row in enumerate(reader, start=1):
        if not row:
            # csv.reader yields [] for a completely blank line; indexing
            # row[0] on it would raise IndexError. Skip the row but keep
            # counting, so the ids of the remaining rows are unchanged.
            continue
        cleaned_row = []
        for col in row:
            val = col.strip()
            if thousands_pattern.match(val):
                # Drop grouping commas so they are not mistaken for field
                # separators once the row is re-joined below.
                val = val.replace(',', '')
            elif paren_thousands_pattern.match(val):
                # Same clean-up, keeping the surrounding parentheses.
                val = '(' + val[1:-1].replace(',', '') + ')'
            cleaned_row.append(val)
        documents.append(','.join(cleaned_row))
        metadatas.append({'row': row_number, 'first_element': row[0]})
        ids.append(str(row_number))

In [23]:
# Echo the row strings built from T12.csv for a visual check.
# NOTE(review): this displays the entire list; a slice such as documents[:10]
# would keep the cell output short.
documents

['Esper,Espe,r,,,,,,,,,,,,,',
 'New Earth Residential,New,Earth Residential,,,,,,,,,,,,,',
 'Trailing Profit And Loss Detail,Trai,ling Profit And Loss Detail,,,,,,,,,,,,,',
 'August 2025 - Accrual - Accounting Book: Default,Augu,st 2025 - Accrual - Accounting Book: Default,,,,,,,,,,,,,',
 ',,,,,,,,,,,,,,,',
 'Printed 9/23/2025 2:58:13 PM,Prin,ted 9/23/2025 2:58:13 PM,,,,,,,,,,,,,',
 'Account,Acco,unt,Sep 2024 Actual,Oct 2024 Actual,Nov 2024 Actual,Dec 2024 Actual,Jan 2025 Actual,Feb 2025 Actual,Mar 2025 Actual,Apr 2025 Actual,May 2025 Actual,Jun 2025 Actual,Jul 2025 Actual,Aug 2025 Actual,Adjusted Total',
 'INCOME,INCO,ME,,,,,,,,,,,,,',
 'Rental Income,Rent,al Income,,,,,,,,,,,,,',
 'Gross Potential Rent,Gros,s Potential Rent,,,,,,,,,,,,,',
 '4110 Market Rent,4110,Market Rent,535295.67,531920.00,521025.00,516165.00,518355.00,517855.00,517200.00,528415.00,554965.00,557420.00,530255.00,526860.00,6355730.67',
 '4120 Loss to Old Lease,4120,Loss to Old Lease,(12922.03),(9871.51),1699.55,643

In [24]:
# Remove any records already stored under the ids built from T12.csv; the
# following cell adds records under these same ids.
# NOTE(review): presumably here so that re-running the load does not collide
# with rows from an earlier run — confirm.
collection.delete(ids=ids)

In [25]:
# Store the CSV rows: the three lists are parallel, so entry k of each one
# describes the same row (its id, its text, its metadata).
collection.add(
    ids=ids,
    metadatas=metadatas,
    documents=documents,
)

In [35]:
# Run four text searches in a single call, asking for one match per query,
# then print that single matching document for each query in order.
results = collection.query(
    n_results=1,
    query_texts=[
        'jan feb mar apr may jun jul aug sep oct nov dec',
        'net income',
        'net operating income',
        'non operating',
    ],
)
for hits in results['documents']:
    print(hits[0])

Account,Acco,unt,Sep 2024 Actual,Oct 2024 Actual,Nov 2024 Actual,Dec 2024 Actual,Jan 2025 Actual,Feb 2025 Actual,Mar 2025 Actual,Apr 2025 Actual,May 2025 Actual,Jun 2025 Actual,Jul 2025 Actual,Aug 2025 Actual,Adjusted Total
NET INCOME,NET,NETINCOME,(1285.97),(15196.69),17809.41,(38248.16),(91651.82),(11579.60),(90357.50),(1413.66),23364.66,(75790.62),(104356.83),(123271.94),(511978.72)
NET OPERATING INCOME,NET,OPERATING INCOME,377758.94,369457.50,385769.45,383657.01,367121.56,397663.41,383116.53,406426.06,418280.49,391697.97,373216.07,354586.73,4608751.72
Other Non-Operating,Othe,Other Non-Operating,,,,,,,,,,,,,
