In [1]:
#Run this cell to install the necessary packages
import importlib.metadata
import subprocess
import sys

import pkg_resources

def install_if_needed(package, version):
    '''Install ``package`` at exactly ``version`` unless that version is already present.

    Args:
        package: Distribution name to look up and install (e.g. "langchain-core").
        version: Exact version to pin. It is compared with the installed
            version by plain string equality.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a non-zero status.
    '''
    try:
        installed_version = importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        installed_version = None  # the distribution is not installed at all

    if installed_version == version:
        return

    # Run pip through the interpreter executing this code so the package is
    # installed into the same environment, not whichever "pip" is first on PATH.
    subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package}=={version}"])

# Packages to install, each pinned to an exact version (processed in this order)
PINNED_PACKAGES = {
    "langchain-core": "0.3.18",
    "langchain-openai": "0.2.8",
    "langchain-community": "0.3.7",
    "unstructured": "0.14.4",
    "langchain-chroma": "0.1.4",
    "langchain-text-splitters": "0.3.2",
}

for package_name, pinned_version in PINNED_PACKAGES.items():
    install_if_needed(package_name, pinned_version)

In [3]:
# Set your API key to a variable
from dotenv import load_dotenv
import os

# Let python-dotenv populate the process environment before the key is read
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")

# Stop here when the key is missing or empty
if openai_api_key is None or openai_api_key == "":
    raise ValueError("OPENAI_API_KEY not found in .env file")

# Import the required packages
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

In [4]:
# Point an UnstructuredHTMLLoader at the warning-messages HTML file and load it
MANUAL_HTML_PATH = "data/mg-zs-warning-messages.html"

loader = UnstructuredHTMLLoader(file_path=MANUAL_HTML_PATH)
car_docs = loader.load()

In [5]:
# Chunking parameters handed to the recursive character splitter
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)


In [6]:
# Split the loaded car-manual HTML documents into chunks
splits = text_splitter.split_documents(car_docs)

In [8]:
# After splitting, preview the first few chunks with visible separators
PREVIEW_CHARS = 200  # maximum number of characters shown per chunk

for i, chunk in enumerate(splits[:5], start=1):  # Show first 5 chunks for demo
    text = chunk.page_content
    # Add the ellipsis only when the preview really is shorter than the chunk,
    # so "..." always means "truncated" and never appears twice.
    preview = (text[:PREVIEW_CHARS] + "...") if len(text) > PREVIEW_CHARS else text
    print(f"\n\n=== Chunk {i} (Length: {len(text)} chars) ===")
    print(preview)



=== Chunk 1 (Length: 782 chars) ===
...


=== Chunk 2 (Length: 652 chars) ===
...


=== Chunk 3 (Length: 613 chars) ===
...


=== Chunk 4 (Length: 491 chars) ===
...


=== Chunk 5 (Length: 604 chars) ===
...


In [None]:
# Initialize Chroma vectorstore with documents as splits and using OpenAIEmbeddings.
# Pass the openai_api_key variable that was checked when it was loaded, instead of
# re-reading the environment, so the embeddings use the same key as the chat model.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
)

In [None]:
# Expose the vectorstore as a retriever; no arguments are passed, so the
# library's default search settings apply
retriever = vectorstore.as_retriever()

In [None]:
# RAG prompt text; it has two placeholders, {question} and {context}
RAG_PROMPT_TEMPLATE = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

In [None]:
# Chat LLM: gpt-4o-mini with temperature 0, authenticated with the loaded API key
model = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
    openai_api_key=openai_api_key,
)

In [None]:
# Setup the chain
def _format_docs(docs):
    """Join the text of the retrieved chunks into one context string.

    Args:
        docs: Iterable of retrieved documents, each exposing ``page_content``.

    Returns:
        The chunks' text separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The query string is sent both to the retriever (whose results are flattened
# to plain text for the {context} slot) and, unchanged, to the {question} slot.
# Without the formatter the prompt would receive the repr of the document
# objects, metadata included, instead of just their text.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
)

In [None]:
# Question that will be sent through the RAG chain
query = (
    "The Gasoline Particular Filter Full warning has appeared. "
    "What does this mean and what should I do about it?"
)

In [None]:
# Run the chain on the query, keep the reply's text content, and print it
response = rag_chain.invoke(query)
answer = response.content
print(answer)