## Required Imports

In [8]:
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Set up the UnstructuredFileLoader to process an HTML file

In [9]:
# Build a loader for the HTML file and read it into LangChain documents.
loader = UnstructuredFileLoader(
    file_path="data/user/cb-smarter-building-dhc-service-3.html",
    # unstructured spells this strategy "hi_res" (underscore); the hyphenated
    # "hi-res" is not one of its strategy names. Other option: "fast".
    strategy="hi_res",
    mode="single",  # single (default), elements, paged (for PDFs)
    post_processors=[clean_extra_whitespace],
)

docs = loader.load()

## Iterate through the loaded documents and print their contents

In [10]:
# Print each loaded document; the output below shows the printed form
# starting with the document's page_content.
for doc in docs:
    print(doc)

page_content='Crossbreed Smarter Building DHC Service\n\nMario Toffia\n\nmario.toffia@crossbreed.se\n\nTable of Contents\n\nAbstract\n\nCapabilities Energy efficiency Operation of heating system Calculation\n\nProcesses Communication of Data-point from Controller Resolution Settings Algorithm\n\nSensors\n\nOnboarding\n\nColophon\n\nThis chapter outlines Crossbreed Smarter Building DHC [1] Service and the capabilities and processes connected to it.\n\nAbstract\n\nCrossbreed Smarter Building DHC, is an OEM service for district heating equipment providers offered to their customers, the property owners or energy operators, enabling energy optimization and monitoring of district heating systems in buildings.\n\nThanks to a combination of different securely integrated data sources and embedded AI algorithms, the Service supports the heating system to reduce energy usage in buildings without renouncing comfort and indoor climate.\n\nParallel to lowering the energy costs for building owners a

## Use Recursive Character Splitter for HTML docs

In [14]:
# Split the loaded documents into chunks, configured with a chunk size of 400
# and an overlap of 100 between neighbouring chunks.
# NOTE(review): `docs` is rebound to the chunk list here, so running this
# cell a second time splits the chunks again rather than the loader output.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents=docs)

# Show every chunk; the printed form includes page_content and metadata.
for doc in docs:
    print(doc)

page_content='Crossbreed Smarter Building DHC Service\n\nMario Toffia\n\nmario.toffia@crossbreed.se\n\nTable of Contents\n\nAbstract\n\nCapabilities Energy efficiency Operation of heating system Calculation\n\nProcesses Communication of Data-point from Controller Resolution Settings Algorithm\n\nSensors\n\nOnboarding\n\nColophon' metadata={'source': 'data/user/cb-smarter-building-dhc-service-3.html'}
page_content='Sensors\n\nOnboarding\n\nColophon\n\nThis chapter outlines Crossbreed Smarter Building DHC [1] Service and the capabilities and processes connected to it.\n\nAbstract' metadata={'source': 'data/user/cb-smarter-building-dhc-service-3.html'}
page_content='Abstract\n\nCrossbreed Smarter Building DHC, is an OEM service for district heating equipment providers offered to their customers, the property owners or energy operators, enabling energy optimization and monitoring of district heating systems in buildings.' metadata={'source': 'data/user/cb-smarter-building-dhc-service-3.htm

## Load the docs into Chroma DB

In [None]:
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings

# TODO: the draft below is commented out and is not runnable as written:
# `os` is not imported in the cells shown, `texts` and `metadata` are never
# defined in them, and the `self.*` references only make sense inside a
# class method. Adapt it to the notebook's own variables before enabling.
#embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
#vector_store = Chroma(embeddings=embeddings)
#
#vector_store.from_texts(
#            texts=texts,
#            metadatas=metadata,
#            persist_directory=self.embeddings_path,
#            embedding=self.embeddings,
#        )