## Links

https://medium.com/@onkarmishra/using-langchain-for-question-answering-on-own-data-3af0a82789ed
https://github.com/langchain-ai/langchain/tree/master/templates
https://unstructured-io.github.io/unstructured/core/embedding.html
https://github.com/langchain-ai/langchain/blob/master/templates/rag-conversation/rag_conversation/chain.py
https://python.langchain.com/docs/modules/data_connection/retrievers/contextual_compression/
https://python.langchain.com/docs/expression_language/cookbook/retrieval

## Required Imports

In [16]:
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
import os

## Set up the UnstructuredFileLoader to process an HTML file

In [31]:
# Load the HTML page as a single Document, normalising runs of whitespace in
# the extracted text via the `clean_extra_whitespace` post-processor.
loader = UnstructuredFileLoader(
    file_path="data/user/cb-smarter-building-dhc-service-3.html",
    # Unstructured spells this strategy "hi_res" (underscore); the previous
    # "hi-res" is not a recognised strategy name. Other option: "fast".
    strategy="hi_res",
    mode="single",  # single (default), elements, paged (for PDFs)
    post_processors=[clean_extra_whitespace],
)

docs = loader.load()

## Iterate through the loaded documents and print their content

In [32]:
# Dump every Document returned by the loader so its text can be inspected.
for document in docs:
    print(document)

page_content='Crossbreed Smarter Building DHC Service\n\nMario Toffia\n\nmario.toffia@crossbreed.se\n\nTable of Contents\n\nAbstract\n\nCapabilities Energy efficiency Operation of heating system Calculation\n\nProcesses Communication of Data-point from Controller Resolution Settings Algorithm\n\nSensors\n\nOnboarding\n\nColophon\n\nThis chapter outlines Crossbreed Smarter Building DHC [1] Service and the capabilities and processes connected to it.\n\nAbstract\n\nCrossbreed Smarter Building DHC, is an OEM service for district heating equipment providers offered to their customers, the property owners or energy operators, enabling energy optimization and monitoring of district heating systems in buildings.\n\nThanks to a combination of different securely integrated data sources and embedded AI algorithms, the Service supports the heating system to reduce energy usage in buildings without renouncing comfort and indoor climate.\n\nParallel to lowering the energy costs for building owners a

## Use Recursive Character Splitter for HTML docs

In [46]:
# Cut the loaded documents into overlapping chunks; `add_start_index=True`
# records each chunk's character offset in its metadata as `start_index`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    add_start_index=True,
)
splits = text_splitter.split_documents(docs)

# Print every chunk for inspection.
for chunk in splits:
    print(chunk)

page_content='Crossbreed Smarter Building DHC Service\n\nMario Toffia\n\nmario.toffia@crossbreed.se\n\nTable of Contents\n\nAbstract\n\nCapabilities Energy efficiency Operation of heating system Calculation\n\nProcesses Communication of Data-point from Controller Resolution Settings Algorithm\n\nSensors\n\nOnboarding\n\nColophon' metadata={'source': 'data/user/cb-smarter-building-dhc-service-3.html', 'start_index': 0}
page_content='Sensors\n\nOnboarding\n\nColophon\n\nThis chapter outlines Crossbreed Smarter Building DHC [1] Service and the capabilities and processes connected to it.\n\nAbstract' metadata={'source': 'data/user/cb-smarter-building-dhc-service-3.html', 'start_index': 269}
page_content='Abstract\n\nCrossbreed Smarter Building DHC, is an OEM service for district heating equipment providers offered to their customers, the property owners or energy operators, enabling energy optimization and monitoring of district heating systems in buildings.' metadata={'source': 'data/user

## Load the docs into chroma db

In [49]:
import uuid

embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Derive a deterministic id for each chunk from its text (UUIDv5), so chunks
# with identical content always map to the same id.
ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content)) for chunk in splits]

# Keep only the first chunk seen for every id. A dict preserves insertion
# order, so the id list and the document list below stay index-aligned.
# (Building the ids with `list(set(ids))` would put them in arbitrary order
# and pair documents with ids that were not derived from their content.)
docs_by_id = {}
for chunk_id, chunk in zip(ids, splits):
    docs_by_id.setdefault(chunk_id, chunk)

unique_ids = list(docs_by_id)
unique_docs = list(docs_by_id.values())
print(unique_ids)

# Add the unique documents to the database, each under its own content id.
db = Chroma.from_documents(unique_docs, embeddings, ids=unique_ids)

['2dccf0ac-ba9d-5970-b5ac-c58a64f89ff6', '9183f1f9-1fb0-5fd8-95ce-e07819bb2bae', '57954a24-723c-5d65-95d0-1b2382456819', 'c4743e1e-adc8-5aaf-946e-87af1053fa08', '7ed61562-828a-566f-aef8-c32e22ce283d', 'd06de873-e2a0-57ea-bff1-353704a60763', '3c4c1d68-abb8-59d4-a381-a615a22fb6f9', '9cbc2e17-9d47-5720-92b3-aca47ed31c1e', '5a1a143d-05a6-5202-ab30-2c5aa5aecbb5', '4c7aeef1-7f33-5ca5-892a-eb59e10e6fca', '2f35a3b3-e0cd-5eae-b2bb-5d77240a9b41', '37effc08-2e41-5168-a905-3f36e8eb0a96', '1a055656-a4c9-5a0f-851d-2eedb111d8cf', '84139345-41c1-5b26-8ace-79be95056048', 'cc292c7e-7b80-50e6-bff0-5609a85390e5', 'a6c7c2bd-97b1-5b8e-a69d-704276d7c3ac', '19d9be82-e44d-5425-a103-0fa00491ff09', 'b15fd2a1-ce8a-5002-a19a-6f3e117b7be4', 'f10846c6-4518-5c7e-8f41-37442b38ca1c', 'b41b6364-6f53-51d2-a6ae-31fea0b24b51', 'f3b95746-917a-546f-9202-1496444560ae', '00961037-e281-5251-b2c3-2c0a015544c6', '01586272-13ca-5455-98e6-ac7a7b50ed38', '87ba0213-2a56-5839-b5a4-5230ccb514b8', '6127082b-5022-5515-9c97-f406a8ab638a',

## Try to retrieve the docs from chroma db

In [87]:
# Similarity-search retriever over the Chroma store, configured with k=6.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

def search(query):
    """Return the retriever's hits for ``query`` with duplicate texts removed.

    Each hit is keyed by a UUIDv5 of its ``page_content``. Only the first hit
    for a given key is kept, and hits stay in the order the retriever
    returned them.
    """
    first_hit_by_key = {}
    for hit in retriever.get_relevant_documents(query):
        content_key = str(uuid.uuid5(uuid.NAMESPACE_DNS, hit.page_content))
        if content_key not in first_hit_by_key:
            first_hit_by_key[content_key] = hit

    return list(first_hit_by_key.values())

def search_text(query) -> str:
    """Return the text of every deduplicated hit for ``query``.

    Each hit's ``page_content`` is followed by a newline; the result is an
    empty string when there are no hits.
    """
    return "".join(hit.page_content + "\n" for hit in search(query))


## Create Chat Prompt

In [90]:
from langchain.prompts import ChatPromptTemplate

# Prompt layout: the retrieved context comes first, followed by a separator
# line, the instruction, and finally the user's question.
# `{context}` and `{question}` are the two placeholders to be filled in.
template = """{context}

------------------
Using the context above, please answer the following question:

Question: {question}
"""

# Build the chat prompt from the template string above.
prompt = ChatPromptTemplate.from_template(template)

## Create Chain

In [92]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.globals import set_debug, set_verbose
from langchain.schema.runnable import RunnablePassthrough

# Enable LangChain's global debug and verbose flags so each step of the
# chain run is traced in the cell output.
set_debug(True)
set_verbose(True)

# Pipeline: add a `context` key (retrieved text for the question) to the
# input dict, fill the prompt, call the chat model, then parse the reply
# with StrOutputParser.
chain = (
    RunnablePassthrough.assign(context=lambda x: search_text(x["question"]))
    | prompt
    | ChatOpenAI(model="gpt-4", temperature=0)
    | StrOutputParser()
)

chain.invoke({"question": "Which datapoints are for primary circuit?"})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "Which datapoints are for primary circuit?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "question": "Which datapoints are for primary circuit?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:<lambda>] Entering Chain run with input:
[0m{
  "question": "Which datapoints are for primary circuit?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 4:chain:<lambda>] Entering Chain run with input:
[0m{
  "question": "Which datapoints are for primary circuit?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 4:chain:<lambda>] [0ms] Exiting Chain run with output:
[0m{
  "output": "Which datapoints are for primary circuit?"
}
[36;1m[1;3m[chain/end][0m [1m[1:ch

'The datapoints for the primary circuit are Supply temperature primary side, Return temperature primary side, Aggregated massflow primary side, Massflow primary side, Aggregated energy usage primary side, and Heat demand primary side.'