In [5]:
!pip install requests langchain_community langchain_chroma pypdf cryptography chromadb sentence-transformers
!pip freeze > requirements.txt

Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp313-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting Pillow (from sentence-transformers)
  Using cached pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.

In [2]:
from download_and_extract_ncc import download_and_extract_ncc
from load_embed_pdfs import load_and_embed_pdfs_to_chroma

#### Variables to test:
- Chunking
- Embedding model
- Language model


In [3]:
ncc_zip_url = "https://ncc.abcb.gov.au/system/files/ncc/ncc2022-complete-series-20230501b.zip" 
# Directory to save the extracted PDFs
pdf_directory = "NCC_PDFs"

# Call the function to download and extract
download_and_extract_ncc(ncc_zip_url, save_directory=pdf_directory)

Attempting to download from: https://ncc.abcb.gov.au/system/files/ncc/ncc2022-complete-series-20230501b.zip
Successfully downloaded NCC_Complete_Series.zip to NCC_PDFs/NCC_Complete_Series.zip
Extracting contents of NCC_Complete_Series.zip...
Extracted: abcb-housing-provisions-2022-20230501b.pdf
Extracted: ncc2022-combined-vol2-housing-provisions-20230501b.pdf
Extracted: ncc2022-consolidated-performance-requirements-20230501b.pdf
Extracted: ncc2022-volume-one-20230501b.pdf
Extracted: ncc2022-volume-three-20230501b.pdf
Extracted: ncc2022-volume-two-20230501b.pdf
Extraction complete.
Removed temporary ZIP file: NCC_PDFs/NCC_Complete_Series.zip


In [4]:
# List of PDF filenames to process
pdf_filenames_list = [
    "ncc2022-volume-one-20230501b.pdf",
    "ncc2022-volume-two-20230501b.pdf",
    "ncc2022-volume-three20230501b.pdf"
]

# Directory where the Chroma DB will be saved
chroma_db_output_path = "ncc_chroma_db"

# Embeddings model name
embeddings_model_name="all-MiniLM-L6-v2"

# --- Run the function ---
load_and_embed_pdfs_to_chroma(
    pdf_filenames=pdf_filenames_list,
    pdf_directory=pdf_directory,
    embeddings_model_name=embeddings_model_name,
    chroma_db_path=chroma_db_output_path,
)

--- Starting PDF Processing and Embedding ---
Loading and chunking 'ncc2022-volume-one-20230501b.pdf' by page...
  - Loaded 884 pages from 'ncc2022-volume-one-20230501b.pdf'.
Loading and chunking 'ncc2022-volume-two-20230501b.pdf' by page...


Impossible to decode XFormObject /Fm0: Invalid Elementary Object starting with b')' @6: b"r\xf8\x08\xbe\xd4\x98)H\xd5\xd8\xeb\xbb\xe7&\xf7:\xaa\xfc\xd4R\x98\xa9\x13+\xf0\xeb\xf7\xd8\xc4\xcd\xe7\x01\r\xef\xb6\xb4>\x1b\x01\x08B\xbe\xf4\x0f\xee\x86b\x89\xa1\x83\xf1n\tI\xd4\xf3\xe4\xe8r\xf6K\xe0\xebe\xe5\xdf\x1e\x12\x84|\xb1\xb0\xe9m9\xe7'\xa1\xc0\x94"


  - Loaded 312 pages from 'ncc2022-volume-two-20230501b.pdf'.
Error: PDF file not found at 'NCC_PDFs/ncc2022-volume-three20230501b.pdf'. Skipping.

Total pages loaded across all PDFs: 1196
Initializing embedding model (this might take a moment the first time)...


  embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)


Embedding model initialized.
Initializing Chroma DB at 'ncc_chroma_db'...
Chroma DB created/loaded and pages embedded.
You can now find the Chroma DB files in the 'ncc_chroma_db' directory.


In [17]:
from chroma_db_query import query_chroma_db

prompt = "How do I assess if a wall is fire-resisting?"
outputs = query_chroma_db(prompt, chroma_db_output_path, embeddings_model_name)

1. File: NCC_PDFs/ncc2022-volume-one-20230501b.pdf
   Content: Fire resistance
NCC 2022 Volume One - Building Code of Australia Page 149
  
 
 S6C1   Scope 
[2019: Spec C1.8: 1] 
This Specification describes tests to be applied to and criteria to be satisfied by ...

2. File: NCC_PDFs/ncc2022-volume-one-20230501b.pdf
   Content: Fire resistance
NCC 2022 Volume One - Building Code of Australia Page 149
  
 
 S6C1   Scope 
[2019: Spec C1.8: 1] 
This Specification describes tests to be applied to and criteria to be satisfied by ...

3. File: NCC_PDFs/ncc2022-volume-one-20230501b.pdf
   Content: Fire resistance
NCC 2022 Volume One - Building Code of Australia Page 151
with, and in a manner identical with, the actual construction. 
If the distance between supports of the actual construction is...

4. File: NCC_PDFs/ncc2022-volume-one-20230501b.pdf
   Content: Fire resistance
NCC 2022 Volume One - Building Code of Australia Page 151
with, and in a manner identical with, the actual constructi

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load SmolLm3 model and tokenizer from HuggingFace
model_name = "HuggingFaceTB/SmolLM3-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [None]:
from transformers import pipeline

generator = pipeline('text-generation', model="Qwen/Qwen3-8B")

Fetching 5 files:   0%|          | 0/5 [52:17<?, ?it/s]


In [None]:
# Prepare context from outputs (RAG)
context = " ".join([o.page_content for o in outputs])

# Construct the prompt for RAG
rag_prompt = f"System: You are a helpful chat bot that uses context from the 2022 Australian National Construction Code to answer questions. \n\n Context:\n{context}\n\nQuestion: {prompt}\nAnswer:"

In [None]:
generated_text = generator(rag_prompt, max_new_tokens=50)

equired to be fire-resisting and which bounds a lift shaft, stair shaft, or service shaft, fire-isolated passageway or fire-isolated ramp must be subjected to a series of tests and must fulfil certain


In [None]:
print(generated_text[0]['generated_text'][-1000:])

 and installation of sheet roof and wall 
cladding — Metal (See Note 2)
B1D4, F3D2, 
F3D5
H1D7 N/A N/A
AS1562 Part 3 2006 Design and installation of sheet roof and wall 
cladding — Plastic
B1D4, F3D2 H1D7 N/A N/A
AS 1657 2018 Fixed platforms, walkways, stairways and 
ladders — Design, construction and 
installation
D2D21, D2D22, 
D3D23, I1D6, 
I3D5
N/A N/A N/A
AS/NZS 1664 Part 
1
1997 Aluminium structures — Limit state design 
(incorporating amendment 1)
B1D4 N/A 2.2.4 N/A
AS/NZS 1664 Part 
2
1997 Aluminium structures — Allowable stress 
design (incorporating amendment 1)
B1D4 N/A 2.2.4 N/A
Referenced documents
NCC 2022 Volume One - Building Code of Australia Page 602 
 (1 May 2023)

Question: How do I assess if a wall is fire-resisting?
Answer: A wall of lightweight construction that is required to be fire-resisting and which bounds a lift shaft, stair shaft, or service shaft, fire-isolated passageway or fire-isolated ramp must be subjected to a series of tests and must fulfil certain

In [9]:
# Construct the prompt for RAG
rag_prompt = f"Context:\n{context}\n\nQuestion: {prompt}\nAnswer:"

# Tokenize and generate answer
inputs = tokenizer(rag_prompt, return_tensors="pt", truncation=True, max_length=2048)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256)
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(answer)

Context:
Fire resistance
NCC 2022 Volume One - Building Code of Australia Page 149
  
 
 S6C1   Scope 
[2019: Spec C1.8: 1] 
This Specification describes tests to be applied to and criteria to be satisfied by a wall system of lightweight construction. 
 S6C2   Application 
[2019: Spec C1.8: 2] 
A wall system need not be tested in accordance with this Specification for static pressure or impact if it is designed and  
constructed in accordance with the Deemed-to-Satisfy Provisions of Part B1 to resist the appropriate pressures and impacts 
defined in this Specification. 
 
 
 
 S6C3   Walls of certain Class 9b buildings 
[2019: Spec C1.8: 3.1] 
(1) Lightweight construction forming—  
a wall of a lift shaft and stair shaft; and (a)
an external and internal wall bounding a public corridor, public lobby or the like, including a fire-isolated and non (b)
fire-isolated passageway or ramp, 
in spectator stand, sports stadium, cinema or theatre, railway or bus station or airport terminal, must

In [10]:
print(answer[-2000:])

ing loose, dry sand into the bag and must be adjusted before each series (iii)
of impact tests; and 
where the impact bag and suspension cannot be vertical at the instant of impact on a curved surface or an (iv)
inclined surface, the height of drop is the net height at the point of impact. 
For resistance to surface indentation — The test for resistance to surface indentation must be carried out at  (d)
three points on the surface of an undamaged sample sheet as follows: 
A steel ball of 10 mm diameter with a load of 150 N must be placed gently on the surface of the sheet and (i)
allowed to remain in position for 5 minutes. 
The ball and load must then be removed and the diameter of each impression of the ball on the surface  (ii)
measured. 
For resistance of lift shaft construction to repetitive load — As for (b) except that— (e)
it is sufficient to test one specimen with the pressure applied from the side of the construction on which the (i)
lift will operate; and 
the load must be a